def test_combine_first(self):
    series = Series(common.makeIntIndex(20).astype(float),
                    index=common.makeIntIndex(20))

    series_copy = series * 2
    series_copy[::2] = np.NaN

    # nothing used from the input
    combined = series.combine_first(series_copy)
    self.assert_(np.array_equal(combined, series))

    # Holes filled from input
    combined = series_copy.combine_first(series)
    self.assert_(np.isfinite(combined).all())

    self.assert_(np.array_equal(combined[::2], series[::2]))
    self.assert_(np.array_equal(combined[1::2], series_copy[1::2]))

    # mixed types
    index = common.makeStringIndex(20)
    floats = Series(common.randn(20), index=index)
    strings = Series(common.makeStringIndex(10), index=index[::2])

    combined = strings.combine_first(floats)

    common.assert_dict_equal(strings, combined, compare_keys=False)
    common.assert_dict_equal(floats[1::2], combined, compare_keys=False)

    # corner case
    s = Series([1., 2, 3], index=[0, 1, 2])
    result = s.combine_first(Series([], index=[]))
    assert_series_equal(s, result)
def setup(self, sort):
    level1 = tm.makeStringIndex(10).values
    level2 = tm.makeStringIndex(1000).values
    label1 = np.arange(10).repeat(1000)
    label2 = np.tile(np.arange(1000), 10)
    index2 = MultiIndex(levels=[level1, level2],
                        labels=[label1, label2])
    self.df_multi = DataFrame(np.random.randn(len(index2), 4),
                              index=index2,
                              columns=['A', 'B', 'C', 'D'])

    self.key1 = np.tile(level1.take(label1), 10)
    self.key2 = np.tile(level2.take(label2), 10)
    self.df = DataFrame({'data1': np.random.randn(100000),
                         'data2': np.random.randn(100000),
                         'key1': self.key1,
                         'key2': self.key2})

    self.df_key1 = DataFrame(np.random.randn(len(level1), 4),
                             index=level1,
                             columns=['A', 'B', 'C', 'D'])
    self.df_key2 = DataFrame(np.random.randn(len(level2), 4),
                             index=level2,
                             columns=['A', 'B', 'C', 'D'])

    shuf = np.arange(100000)
    np.random.shuffle(shuf)
    self.df_shuf = self.df.reindex(self.df.index[shuf])
def test_combine_first(self):
    values = tm.makeIntIndex(20).values.astype(float)
    series = Series(values, index=tm.makeIntIndex(20))

    series_copy = series * 2
    series_copy[::2] = np.NaN

    # nothing used from the input
    combined = series.combine_first(series_copy)
    tm.assert_series_equal(combined, series)

    # Holes filled from input
    combined = series_copy.combine_first(series)
    assert np.isfinite(combined).all()

    tm.assert_series_equal(combined[::2], series[::2])
    tm.assert_series_equal(combined[1::2], series_copy[1::2])

    # mixed types
    index = tm.makeStringIndex(20)
    floats = Series(tm.randn(20), index=index)
    strings = Series(tm.makeStringIndex(10), index=index[::2])

    combined = strings.combine_first(floats)

    tm.assert_series_equal(strings, combined.loc[index[::2]])
    tm.assert_series_equal(floats[1::2].astype(object),
                           combined.loc[index[1::2]])

    # corner case
    s = Series([1., 2, 3], index=[0, 1, 2])
    result = s.combine_first(Series([], index=[]))
    assert_series_equal(s, result)
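# A minimal standalone sketch (not part of the test above) of the
# combine_first semantics it exercises: values from the caller win, and
# holes (NaN) are filled from the argument. Values are illustrative only.
import numpy as np
import pandas as pd

a = pd.Series([1.0, np.nan, 3.0])
b = pd.Series([9.0, 2.0, 9.0])
# NaN at position 1 is filled from b; positions 0 and 2 keep a's values
print(a.combine_first(b))  # -> [1.0, 2.0, 3.0]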
def setup(self):
    N = 100000
    np.random.seed(1234)
    self.int_unique = pd.Int64Index(np.arange(N * 5))
    # cache is_unique
    self.int_unique.is_unique
    self.int = pd.Int64Index(np.arange(N).repeat(5))
    self.float = pd.Float64Index(np.random.randn(N).repeat(5))

    # Convenience naming.
    self.checked_add = pd.core.algorithms.checked_add_with_arr

    self.arr = np.arange(1000000)
    self.arrpos = np.arange(1000000)
    self.arrneg = np.arange(-1000000, 0)
    self.arrmixed = np.array([1, -1]).repeat(500000)
    self.strings = tm.makeStringIndex(100000)

    self.arr_nan = np.random.choice([True, False], size=1000000)
    self.arrmixed_nan = np.random.choice([True, False], size=1000000)

    # match
    self.uniques = tm.makeStringIndex(1000).values
    self.all = self.uniques.repeat(10)
def setup(self):
    N = 10000
    K = 10
    key1 = tm.makeStringIndex(N).values.repeat(K)
    key2 = tm.makeStringIndex(N).values.repeat(K)
    col_array = np.vstack([key1, key2, np.random.randn(N * K)])
    col_array2 = col_array.copy()
    col_array2[:, :10000] = np.nan
    self.col_array_list = list(col_array)
def setup(self):
    index = tm.makeStringIndex(1000)
    columns = tm.makeStringIndex(30)
    self.df = DataFrame(np.random.randn(1000, 30), index=index,
                        columns=columns)
    self.idx_scalar = index[100]
    self.col_scalar = columns[10]
    self.bool_indexer = self.df[self.col_scalar] > 0
    self.bool_obj_indexer = self.bool_indexer.astype(object)
def setup(self):
    N, K = 5000, 50
    self.index = tm.makeStringIndex(N)
    self.columns = tm.makeStringIndex(K)
    frame = DataFrame(np.random.randn(N, K), index=self.index,
                      columns=self.columns)
    self.data = frame.to_dict()
    self.dict_list = frame.to_dict(orient='records')
    self.data2 = {i: {j: float(j) for j in range(100)}
                  for i in range(2000)}
def setup(self, other_cols, sep, na_rep, na_frac):
    N = 10 ** 5
    mask_gen = lambda: np.random.choice([True, False], N,
                                        p=[1 - na_frac, na_frac])
    self.s = Series(tm.makeStringIndex(N)).where(mask_gen())
    if other_cols == 0:
        # str.cat self-concatenates only for others=None
        self.others = None
    else:
        self.others = DataFrame({i: tm.makeStringIndex(N).where(mask_gen())
                                 for i in range(other_cols)})
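# A minimal sketch of the operation this setup feeds: Series.str.cat either
# joins the strings of the Series itself (others=None) or concatenates
# element-wise with other string columns. Assumes pandas >= 0.23 for
# DataFrame-valued `others`; values below are illustrative only.
import pandas as pd

s = pd.Series(['a', None, 'c'])
print(s.str.cat(sep='-', na_rep='NA'))          # 'a-NA-c' (self-concatenation)
others = pd.DataFrame({0: ['x', 'y', 'z']})
print(s.str.cat(others, sep='-', na_rep='NA'))  # ['a-x', 'NA-y', 'c-z']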
def setup_cache(self):
    size = 10**6
    data = {'int64_small': Series(np.random.randint(0, 100, size=size)),
            'int64_large': Series(np.random.randint(0, 10000, size=size)),
            'object_small': Series(
                tm.makeStringIndex(100).take(
                    np.random.randint(0, 100, size=size))),
            'object_large': Series(
                tm.makeStringIndex(10000).take(
                    np.random.randint(0, 10000, size=size)))}
    return data
def setup(self):
    np.random.seed(1234)
    N, K = 5000, 50
    self.index = tm.makeStringIndex(N)
    self.columns = tm.makeStringIndex(K)
    self.frame = DataFrame(np.random.randn(N, K), index=self.index,
                           columns=self.columns)
    self.data = self.frame.to_dict()
    self.some_dict = list(self.data.values())[0]
    self.dict_list = self.frame.to_dict(orient='records')
    self.data2 = {i: {j: float(j) for j in range(100)}
                  for i in range(2000)}
def setup(self):
    N = 100000
    self.df = pd.DataFrame(
        {'A': pd.Series(tm.makeStringIndex(100).take(
            np.random.randint(0, 100, size=N))),
         'B': pd.Series(tm.makeStringIndex(10000).take(
             np.random.randint(0, 10000, size=N))),
         'D': np.random.randn(N),
         'E': np.arange(N),
         'F': pd.date_range('20110101', freq='s', periods=N),
         'G': pd.timedelta_range('1 day', freq='s', periods=N),
         })
    self.df['C'] = self.df['B'].astype('category')
    self.df.iloc[10:20] = np.nan
def setup(self):
    rng = date_range(start='1/1/1970', periods=10000, freq='1min')
    self.df = DataFrame(np.random.rand(10000, 10), index=rng,
                        columns=range(10))
    self.df['foo'] = 'bar'
    self.rng_subset = Index(rng[::2])
    self.df2 = DataFrame(index=range(10000),
                         data=np.random.rand(10000, 30),
                         columns=range(30))
    N = 5000
    K = 200
    level1 = tm.makeStringIndex(N).values.repeat(K)
    level2 = np.tile(tm.makeStringIndex(K).values, N)
    index = MultiIndex.from_arrays([level1, level2])
    self.s = Series(np.random.randn(N * K), index=index)
    self.s_subset = self.s[::2]
def setup(self, sort, dtype):
    N = 10**5
    data = {'int': pd.Int64Index(np.arange(N).repeat(5)),
            'uint': pd.UInt64Index(np.arange(N).repeat(5)),
            'float': pd.Float64Index(np.random.randn(N).repeat(5)),
            'string': tm.makeStringIndex(N).repeat(5)}
    self.idx = data[dtype]
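# A minimal sketch of the kind of call a `sort`/`dtype`-parametrized setup
# like this typically drives: pd.factorize encodes repeated values as integer
# codes plus uniques, optionally sorting the uniques. Illustrative values.
import pandas as pd

codes, uniques = pd.factorize(pd.Index(['b', 'b', 'a', 'b']), sort=True)
print(codes)    # [1 1 0 1]
print(uniques)  # Index(['a', 'b'], dtype='object')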
def setUp(self):
    self.strIndex = tm.makeStringIndex(100)
    self.dateIndex = tm.makeDateIndex(100)
    self.intIndex = tm.makeIntIndex(100)
    self.floatIndex = tm.makeFloatIndex(100)
    self.empty = Index([])
    self.tuples = Index(zip(["foo", "bar", "baz"], [1, 2, 3]))
def test_to_html(self):
    # big mixed
    biggie = DataFrame({'A': np.random.randn(200),
                        'B': tm.makeStringIndex(200)},
                       index=lrange(200))

    biggie.loc[:20, 'A'] = np.nan
    biggie.loc[:20, 'B'] = np.nan
    s = biggie.to_html()

    buf = StringIO()
    retval = biggie.to_html(buf=buf)
    self.assertIsNone(retval)
    self.assertEqual(buf.getvalue(), s)

    tm.assertIsInstance(s, compat.string_types)

    biggie.to_html(columns=['B', 'A'], col_space=17)
    biggie.to_html(columns=['B', 'A'],
                   formatters={'A': lambda x: '%.1f' % x})

    biggie.to_html(columns=['B', 'A'], float_format=str)
    biggie.to_html(columns=['B', 'A'], col_space=12, float_format=str)

    frame = DataFrame(index=np.arange(200))
    frame.to_html()
def setup(self, lines_orient):
    N = 10**5
    ncols = 5
    index = date_range('20000101', periods=N, freq='H')
    timedeltas = timedelta_range(start=1, periods=N, freq='s')
    datetimes = date_range(start=1, periods=N, freq='s')
    ints = np.random.randint(100000000, size=N)
    floats = np.random.randn(N)
    strings = tm.makeStringIndex(N)
    self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
    self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index)
    self.df_td_int_ts = DataFrame({'td_1': timedeltas,
                                   'td_2': timedeltas,
                                   'int_1': ints,
                                   'int_2': ints,
                                   'ts_1': datetimes,
                                   'ts_2': datetimes},
                                  index=index)
    self.df_int_floats = DataFrame({'int_1': ints,
                                    'int_2': ints,
                                    'int_3': ints,
                                    'float_1': floats,
                                    'float_2': floats,
                                    'float_3': floats},
                                   index=index)
    self.df_int_float_str = DataFrame({'int_1': ints,
                                       'int_2': ints,
                                       'float_1': floats,
                                       'float_2': floats,
                                       'str_1': strings,
                                       'str_2': strings},
                                      index=index)
def setup_method(self, method):
    self.bool_index = tm.makeBoolIndex(10, name='a')
    self.int_index = tm.makeIntIndex(10, name='a')
    self.float_index = tm.makeFloatIndex(10, name='a')
    self.dt_index = tm.makeDateIndex(10, name='a')
    self.dt_tz_index = tm.makeDateIndex(10, name='a').tz_localize(
        tz='US/Eastern')
    self.period_index = tm.makePeriodIndex(10, name='a')
    self.string_index = tm.makeStringIndex(10, name='a')
    self.unicode_index = tm.makeUnicodeIndex(10, name='a')

    arr = np.random.randn(10)
    self.bool_series = Series(arr, index=self.bool_index, name='a')
    self.int_series = Series(arr, index=self.int_index, name='a')
    self.float_series = Series(arr, index=self.float_index, name='a')
    self.dt_series = Series(arr, index=self.dt_index, name='a')
    self.dt_tz_series = self.dt_tz_index.to_series(keep_tz=True)
    self.period_series = Series(arr, index=self.period_index, name='a')
    self.string_series = Series(arr, index=self.string_index, name='a')
    self.unicode_series = Series(arr, index=self.unicode_index, name='a')

    types = ['bool', 'int', 'float', 'dt', 'dt_tz', 'period', 'string',
             'unicode']
    self.indexes = [getattr(self, '{}_index'.format(t)) for t in types]
    self.series = [getattr(self, '{}_series'.format(t)) for t in types]
    self.objs = self.indexes + self.series
def test_repr_mixed_big(self):
    # big mixed
    biggie = DataFrame({"A": np.random.randn(200),
                        "B": tm.makeStringIndex(200)},
                       index=lrange(200))
    biggie.loc[:20, "A"] = nan
    biggie.loc[:20, "B"] = nan

    foo = repr(biggie)  # noqa
def setup(self):
    n, k = 200, 5000
    levels = [np.arange(n),
              tm.makeStringIndex(n).values,
              1000 + np.arange(n)]
    codes = [np.random.choice(n, (k * n)) for lev in levels]
    self.mi = MultiIndex(levels=levels, codes=codes)
def setup(self):
    N = 25000
    index = tm.makeStringIndex(N)
    self.df = DataFrame({'float1': np.random.randn(N),
                         'float2': np.random.randn(N)},
                        index=index)
    self.df_mixed = DataFrame({'float1': np.random.randn(N),
                               'float2': np.random.randn(N),
                               'string1': ['foo'] * N,
                               'bool1': [True] * N,
                               'int1': np.random.randint(0, N, size=N)},
                              index=index)
    self.df_wide = DataFrame(np.random.randn(N, 100))
    self.start_wide = self.df_wide.index[10000]
    self.stop_wide = self.df_wide.index[15000]
    self.df2 = DataFrame({'float1': np.random.randn(N),
                          'float2': np.random.randn(N)},
                         index=date_range('1/1/2000', periods=N))
    self.start = self.df2.index[10000]
    self.stop = self.df2.index[15000]
    self.df_wide2 = DataFrame(np.random.randn(N, 100),
                              index=date_range('1/1/2000', periods=N))
    self.df_dc = DataFrame(np.random.randn(N, 10),
                           columns=['C%03d' % i for i in range(10)])

    self.fname = '__test__.h5'
    self.store = HDFStore(self.fname)
    self.store.put('fixed', self.df)
    self.store.put('fixed_mixed', self.df_mixed)
    self.store.append('table', self.df2)
    self.store.append('table_mixed', self.df_mixed)
    self.store.append('table_wide', self.df_wide)
    self.store.append('table_wide2', self.df_wide2)
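# A minimal sketch (not from the benchmark itself) of the reads the HDFStore
# setup above is presumably timed on: `get` retrieves put() fixed-format
# keys whole, while `select` supports row slicing on append()-ed
# table-format keys. Assumes the '__test__.h5' store created by the setup.
from pandas import HDFStore

store = HDFStore('__test__.h5')
fixed = store.get('fixed')                       # whole fixed-format frame
rows = store.select('table', start=0, stop=100)  # row slice of a table key
store.close()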
def test_to_html(self):
    # big mixed
    biggie = DataFrame({'A': randn(200),
                        'B': tm.makeStringIndex(200)},
                       index=range(200))

    biggie['A'][:20] = nan
    biggie['B'][:20] = nan
    s = biggie.to_html()

    buf = StringIO()
    retval = biggie.to_html(buf=buf)
    self.assert_(retval is None)
    self.assertEqual(buf.getvalue(), s)

    self.assert_(isinstance(s, basestring))

    biggie.to_html(columns=['B', 'A'], col_space=17)
    biggie.to_html(columns=['B', 'A'],
                   formatters={'A': lambda x: '%.1f' % x})

    biggie.to_html(columns=['B', 'A'], float_format=str)
    biggie.to_html(columns=['B', 'A'], col_space=12, float_format=str)

    frame = DataFrame(index=np.arange(200))
    frame.to_html()
def test_to_html(self):
    # big mixed
    biggie = DataFrame({'A': np.random.randn(200),
                        'B': tm.makeStringIndex(200)},
                       index=lrange(200))

    biggie.loc[:20, 'A'] = np.nan
    biggie.loc[:20, 'B'] = np.nan
    s = biggie.to_html()

    buf = StringIO()
    retval = biggie.to_html(buf=buf)
    assert retval is None
    assert buf.getvalue() == s

    assert isinstance(s, compat.string_types)

    biggie.to_html(columns=['B', 'A'], col_space=17)
    biggie.to_html(columns=['B', 'A'],
                   formatters={'A': lambda x: '{x:.1f}'.format(x=x)})

    biggie.to_html(columns=['B', 'A'], float_format=str)
    biggie.to_html(columns=['B', 'A'], col_space=12, float_format=str)

    frame = DataFrame(index=np.arange(200))
    frame.to_html()
def setup_method(self, method):
    super(TestIndex, self).setup_method(method)

    self.d = {
        'string': tm.makeStringIndex(100),
        'date': tm.makeDateIndex(100),
        'int': tm.makeIntIndex(100),
        'rng': tm.makeRangeIndex(100),
        'float': tm.makeFloatIndex(100),
        'empty': Index([]),
        'tuple': Index(zip(['foo', 'bar', 'baz'], [1, 2, 3])),
        'period': Index(period_range('2012-1-1', freq='M', periods=3)),
        'date2': Index(date_range('2013-01-1', periods=10)),
        'bdate': Index(bdate_range('2013-01-02', periods=10)),
        'cat': tm.makeCategoricalIndex(100),
        'interval': tm.makeIntervalIndex(100),
        'timedelta': tm.makeTimedeltaIndex(100, 'H')
    }

    self.mi = {
        'reg': MultiIndex.from_tuples([('bar', 'one'), ('baz', 'two'),
                                       ('foo', 'two'), ('qux', 'one'),
                                       ('qux', 'two')],
                                      names=['first', 'second']),
    }
def setup(self):
    groups = tm.makeStringIndex(10).values

    self.left = DataFrame({'group': groups.repeat(5000),
                           'key': np.tile(np.arange(0, 10000, 2), 10),
                           'lvalue': np.random.randn(50000)})
    self.right = DataFrame({'key': np.arange(10000),
                            'rvalue': np.random.randn(10000)})
def setup(self, index):
    N = 10**5
    indexes = {'string': tm.makeStringIndex(N),
               'datetime': date_range('1900', periods=N, freq='s')}
    index = indexes[index]
    self.s = Series(np.random.rand(N), index=index)
    self.lbl = index[80000]
def setUp(self):
    self.bool_index = tm.makeBoolIndex(10, name='a')
    self.int_index = tm.makeIntIndex(10, name='a')
    self.float_index = tm.makeFloatIndex(10, name='a')
    self.dt_index = tm.makeDateIndex(10, name='a')
    self.dt_tz_index = tm.makeDateIndex(10, name='a').tz_localize(
        tz='US/Eastern')
    self.period_index = tm.makePeriodIndex(10, name='a')
    self.string_index = tm.makeStringIndex(10, name='a')
    self.unicode_index = tm.makeUnicodeIndex(10, name='a')

    arr = np.random.randn(10)
    self.int_series = Series(arr, index=self.int_index, name='a')
    self.float_series = Series(arr, index=self.float_index, name='a')
    self.dt_series = Series(arr, index=self.dt_index, name='a')
    self.dt_tz_series = self.dt_tz_index.to_series(keep_tz=True)
    self.period_series = Series(arr, index=self.period_index, name='a')
    self.string_series = Series(arr, index=self.string_index, name='a')

    types = ['bool', 'int', 'float', 'dt', 'dt_tz', 'period', 'string',
             'unicode']
    fmts = ["{0}_{1}".format(t, f)
            for t in types for f in ['index', 'series']]
    self.objs = [getattr(self, f)
                 for f in fmts if getattr(self, f, None) is not None]
def test_invalid_index_types(self):
    # test all index types
    for i in [tm.makeIntIndex(10), tm.makeFloatIndex(10),
              tm.makePeriodIndex(10)]:
        self.assertRaises(TypeError, lambda: infer_freq(i))

    for i in [tm.makeStringIndex(10), tm.makeUnicodeIndex(10)]:
        self.assertRaises(ValueError, lambda: infer_freq(i))
def setup(self):
    n = 50000
    indices = tm.makeStringIndex(n)
    subsample_size = 40000
    self.x = Series(np.random.randn(n), indices)
    self.y = Series(np.random.randn(subsample_size),
                    index=np.random.choice(indices, subsample_size,
                                           replace=False))
def setup(self, sort, dtype):
    N = 10**5
    data = {'int': pd.Int64Index(np.arange(N)),
            'uint': pd.UInt64Index(np.arange(N)),
            'float': pd.Float64Index(np.arange(N)),
            'string': tm.makeStringIndex(N)}
    self.idx = data[dtype]
    assert self.idx.is_unique
def setup(self):
    self.fname = '__test__.msg'
    N = 100000
    C = 5
    self.df = DataFrame(np.random.randn(N, C),
                        columns=['float{}'.format(i) for i in range(C)],
                        index=date_range('20000101', periods=N, freq='H'))
    self.df['object'] = tm.makeStringIndex(N)
    self.df.to_msgpack(self.fname)
def setup_method(self, method):
    super(TestIndex, self).setup_method(method)

    self.d = {
        'string': tm.makeStringIndex(100),
        'date': tm.makeDateIndex(100),
        'int': tm.makeIntIndex(100),
        'rng': tm.makeRangeIndex(100),
        'float': tm.makeFloatIndex(100),
        'empty': Index([]),
        'tuple': Index(zip(['foo', 'bar', 'baz'], [1, 2, 3])),
        'period': Index(period_range('2012-1-1', freq='M', periods=3)),
        'date2': Index(date_range('2013-01-1', periods=10)),
        'bdate': Index(bdate_range('2013-01-02', periods=10)),
        'cat': tm.makeCategoricalIndex(100)
    }

    self.mi = {
        'reg': MultiIndex.from_tuples([('bar', 'one'), ('baz', 'two'),
                                       ('foo', 'two'), ('qux', 'one'),
                                       ('qux', 'two')],
                                      names=['first', 'second']),
    }
def setup(self):
    n1 = 400
    n2 = 250
    index = MultiIndex(levels=[np.arange(n1), tm.makeStringIndex(n2)],
                       labels=[np.repeat(range(n1), n2).tolist(),
                               list(range(n2)) * n1],
                       names=['lev1', 'lev2'])
    arr = np.random.randn(n1 * n2, 3)
    arr[::10000, 0] = np.nan
    arr[1::10000, 1] = np.nan
    arr[2::10000, 2] = np.nan
    data = DataFrame(arr, index=index, columns=['col1', 'col20', 'col3'])
    self.df = data

    n = 20000
    self.df1 = DataFrame(np.random.randint(1, n, (n, 3)),
                         columns=['jim', 'joe', 'jolie'])
    self.df2 = self.df1.copy()
    self.df2['jim'] = self.df2['joe']

    # use integer division so the high bound stays an int on Python 3
    self.df3 = DataFrame(np.random.randint(1, n // 10, (n, 3)),
                         columns=['jim', 'joe', 'jolie'])
    self.df4 = self.df3.copy()
    self.df4['jim'] = self.df4['joe']
def setup(self, repeats):
    N = 10 ** 5
    self.s = Series(tm.makeStringIndex(N))
    repeat = {"int": 1, "array": np.random.randint(1, 3, N)}
    self.values = repeat[repeats]
def run_command(key):
    pywren.wrenlogging.default_config('INFO')
    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger(__name__)
    logger.info("before everything")
    logger.info(key)
    t_s = time.time()
    partition_num = key['partition_num']
    rounds = key['rounds']
    #em = key['em']
    taskId = key['taskId']
    appName = key['appName']

    # partition_num = 1
    # rounds = 8
    # #em = key['em']
    # taskId = 1
    # appName = 'test-1'

    em = JiffyClient(host=key['em'])
    logger.info("before queue")
    data_ques, msg_que = open_or_create_jiffy_queues(
        em, appName, partition_num, 1, 'sender')
    logger.info("queue opened")

    for i in range(rounds):
        ### dd = read_s3_table(key)
        msg = create_msg(rounds, taskId, i, partition_num)

        ######### create a table here to replace the input
        right_table = 100000
        indices = tm.makeStringIndex(right_table).values
        # NOTE: this rebinds the `key` parameter to the tiled key column
        key = np.tile(indices[:right_table], 1)
        right = DataFrame({
            "key": key,
            "value": np.random.randn(right_table)
        })
        logger.info("Finish generating data")

        x = 0
        if i == rounds - 1:
            x = 1

        encoded = right.to_csv(sep="|", header=False,
                               index=False).encode('utf-8')
        a = np.random.randint(1, 10, 500000)
        encoded = np.asarray(a).astype('S100').tobytes()
        # print(sys.getsizeof(encoded))

        data_path = "/" + appName + "/" + '01'
        test_que = em.open_or_create_queue(data_path, "local://tmp", 10, 1)
        logger.info("get encoded size" + str(sys.getsizeof(encoded)))
        ta = time.time()
        test_que.put(encoded)
        tb = time.time()
        logger.info("write takes" + str(tb - ta))
        logger.info("before get")
        obj = test_que.get()
        logger.info("get obj of size" + str(sys.getsizeof(obj)))
        tc = time.time()
        logger.info("get takes " + str(tc - tb))

        # data_ques[0].put(encoded)
        # logger.info("write finished")
        # logger.info("before get")
        # obj = data_ques[0].get()

        #res = write_jiffy_partitions(right, ['key'], 'uniform',
        #                             partition_num, data_ques,
        #                             msg_que=msg_que, msg=msg, fin=x)

    t_f = time.time()
    # share.append([t_s,t_f])
    return ([t_s, t_f])
def setUp(self):
    self.strIndex = tm.makeStringIndex(100)
    self.dateIndex = tm.makeDateIndex(100)
    self.intIndex = tm.makeIntIndex(100)
    self.empty = Index([])
    self.tuples = Index(zip(['foo', 'bar', 'baz'], [1, 2, 3]))
def setup(self, repeats):
    N = 10**5
    self.s = Series(tm.makeStringIndex(N))
    repeat = {'int': 1, 'array': np.random.randint(1, 3, N)}
    self.repeat = repeat[repeats]
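# A minimal sketch of Series.str.repeat, which a setup like the one above
# feeds: `repeats` may be a scalar applied to every element or a sequence
# with one count per element. Values are illustrative only.
import numpy as np
import pandas as pd

s = pd.Series(['a', 'b', 'c'])
print(s.str.repeat(2))                    # ['aa', 'bb', 'cc']
print(s.str.repeat(np.array([1, 2, 3])))  # ['a', 'bb', 'ccc']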
def setup(self, expand):
    self.s = Series(tm.makeStringIndex(10**5)).str.join('--')
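# Presumably this setup drives a Series.str.split benchmark parametrized by
# `expand`; a minimal sketch of the two return shapes (illustrative values):
import pandas as pd

s = pd.Series(['a--b--c'])
print(s.str.split('--'))               # one column of lists: [['a', 'b', 'c']]
print(s.str.split('--', expand=True))  # DataFrame with one column per piece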
def setup(self, errors):
    N = 10000
    self.float = Series(np.random.randn(N))
    self.numstr = self.float.astype('str')
    self.str = Series(tm.makeStringIndex(N))
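# Presumably this setup feeds a pd.to_numeric benchmark parametrized over
# its `errors` modes; a minimal sketch of those modes (illustrative values):
import pandas as pd

s = pd.Series(['1.5', '2', 'oops'])
print(pd.to_numeric(s, errors='coerce'))  # invalid entries become NaN
print(pd.to_numeric(s, errors='ignore'))  # input returned unchanged
# errors='raise' (the default) raises ValueError on 'oops'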
def setup(self):
    self.s = Series(index=tm.makeStringIndex(10000))
class TestSeriesMisc:
    def test_scalarop_preserve_name(self, datetime_series):
        result = datetime_series * 2
        assert result.name == datetime_series.name

    def test_copy_name(self, datetime_series):
        result = datetime_series.copy()
        assert result.name == datetime_series.name

    def test_copy_index_name_checking(self, datetime_series):
        # don't want to be able to modify the index stored elsewhere after
        # making a copy
        datetime_series.index.name = None
        assert datetime_series.index.name is None
        assert datetime_series is datetime_series

        cp = datetime_series.copy()
        cp.index.name = "foo"
        printing.pprint_thing(datetime_series.index.name)
        assert datetime_series.index.name is None

    def test_append_preserve_name(self, datetime_series):
        result = datetime_series[:5].append(datetime_series[5:])
        assert result.name == datetime_series.name

    def test_binop_maybe_preserve_name(self, datetime_series):
        # names match, preserve
        result = datetime_series * datetime_series
        assert result.name == datetime_series.name
        result = datetime_series.mul(datetime_series)
        assert result.name == datetime_series.name

        result = datetime_series * datetime_series[:-2]
        assert result.name == datetime_series.name

        # names don't match, don't preserve
        cp = datetime_series.copy()
        cp.name = "something else"
        result = datetime_series + cp
        assert result.name is None
        result = datetime_series.add(cp)
        assert result.name is None

        ops = ["add", "sub", "mul", "div", "truediv", "floordiv", "mod",
               "pow"]
        ops = ops + ["r" + op for op in ops]
        for op in ops:
            # names match, preserve
            s = datetime_series.copy()
            result = getattr(s, op)(s)
            assert result.name == datetime_series.name

            # names don't match, don't preserve
            cp = datetime_series.copy()
            cp.name = "changed"
            result = getattr(s, op)(cp)
            assert result.name is None

    def test_combine_first_name(self, datetime_series):
        result = datetime_series.combine_first(datetime_series[:5])
        assert result.name == datetime_series.name

    def test_getitem_preserve_name(self, datetime_series):
        result = datetime_series[datetime_series > 0]
        assert result.name == datetime_series.name

        result = datetime_series[[0, 2, 4]]
        assert result.name == datetime_series.name

        result = datetime_series[5:10]
        assert result.name == datetime_series.name

    def test_pickle_datetimes(self, datetime_series):
        unp_ts = self._pickle_roundtrip(datetime_series)
        tm.assert_series_equal(unp_ts, datetime_series)

    def test_pickle_strings(self, string_series):
        unp_series = self._pickle_roundtrip(string_series)
        tm.assert_series_equal(unp_series, string_series)

    def _pickle_roundtrip(self, obj):
        with tm.ensure_clean() as path:
            obj.to_pickle(path)
            unpickled = pd.read_pickle(path)
            return unpickled

    def test_argsort_preserve_name(self, datetime_series):
        result = datetime_series.argsort()
        assert result.name == datetime_series.name

    def test_sort_index_name(self, datetime_series):
        result = datetime_series.sort_index(ascending=False)
        assert result.name == datetime_series.name

    def test_constructor_dict(self):
        d = {"a": 0.0, "b": 1.0, "c": 2.0}
        result = Series(d)
        expected = Series(d, index=sorted(d.keys()))
        tm.assert_series_equal(result, expected)

        result = Series(d, index=["b", "c", "d", "a"])
        expected = Series([1, 2, np.nan, 0], index=["b", "c", "d", "a"])
        tm.assert_series_equal(result, expected)

    def test_constructor_subclass_dict(self):
        data = tm.TestSubDict((x, 10.0 * x) for x in range(10))
        series = Series(data)
        expected = Series(dict(data.items()))
        tm.assert_series_equal(series, expected)

    def test_constructor_ordereddict(self):
        # GH3283
        data = OrderedDict(
            ("col{i}".format(i=i), np.random.random()) for i in range(12)
        )

        series = Series(data)
        expected = Series(list(data.values()), list(data.keys()))
        tm.assert_series_equal(series, expected)

        # Test with subclass
        class A(OrderedDict):
            pass

        series = Series(A(data))
        tm.assert_series_equal(series, expected)

    def test_constructor_dict_multiindex(self):
        d = {("a", "a"): 0.0, ("b", "a"): 1.0, ("b", "c"): 2.0}
        _d = sorted(d.items())
        result = Series(d)
        expected = Series(
            [x[1] for x in _d],
            index=pd.MultiIndex.from_tuples([x[0] for x in _d])
        )
        tm.assert_series_equal(result, expected)

        d["z"] = 111.0
        _d.insert(0, ("z", d["z"]))
        result = Series(d)
        expected = Series(
            [x[1] for x in _d],
            index=pd.Index([x[0] for x in _d], tupleize_cols=False)
        )
        result = result.reindex(index=expected.index)
        tm.assert_series_equal(result, expected)

    def test_constructor_dict_timedelta_index(self):
        # GH #12169 : Resample category data with timedelta index
        # construct Series from dict as data and TimedeltaIndex as index
        # will result NaN in result Series data
        expected = Series(
            data=["A", "B", "C"], index=pd.to_timedelta([0, 10, 20], unit="s")
        )

        result = Series(
            data={
                pd.to_timedelta(0, unit="s"): "A",
                pd.to_timedelta(10, unit="s"): "B",
                pd.to_timedelta(20, unit="s"): "C",
            },
            index=pd.to_timedelta([0, 10, 20], unit="s"),
        )
        tm.assert_series_equal(result, expected)

    def test_sparse_accessor_updates_on_inplace(self):
        s = pd.Series([1, 1, 2, 3], dtype="Sparse[int]")
        s.drop([0, 1], inplace=True)
        assert s.sparse.density == 1.0

    def test_tab_completion(self):
        # GH 9910
        s = Series(list("abcd"))
        # Series of str values should have .str but not .dt/.cat in __dir__
        assert "str" in dir(s)
        assert "dt" not in dir(s)
        assert "cat" not in dir(s)

        # similarly for .dt
        s = Series(date_range("1/1/2015", periods=5))
        assert "dt" in dir(s)
        assert "str" not in dir(s)
        assert "cat" not in dir(s)

        # Similarly for .cat, but with the twist that str and dt should be
        # there if the categories are of that type first cat and str.
        s = Series(list("abbcd"), dtype="category")
        assert "cat" in dir(s)
        assert "str" in dir(s)  # as it is a string categorical
        assert "dt" not in dir(s)

        # similar to cat and str
        s = Series(date_range("1/1/2015", periods=5)).astype("category")
        assert "cat" in dir(s)
        assert "str" not in dir(s)
        assert "dt" in dir(s)  # as it is a datetime categorical

    def test_tab_completion_with_categorical(self):
        # test the tab completion display
        ok_for_cat = [
            "categories",
            "codes",
            "ordered",
            "set_categories",
            "add_categories",
            "remove_categories",
            "rename_categories",
            "reorder_categories",
            "remove_unused_categories",
            "as_ordered",
            "as_unordered",
        ]

        def get_dir(s):
            results = [r for r in s.cat.__dir__() if not r.startswith("_")]
            return sorted(set(results))

        s = Series(list("aabbcde")).astype("category")
        results = get_dir(s)
        tm.assert_almost_equal(results, sorted(set(ok_for_cat)))

    @pytest.mark.parametrize(
        "index",
        [
            tm.makeUnicodeIndex(10),
            tm.makeStringIndex(10),
            tm.makeCategoricalIndex(10),
            Index(["foo", "bar", "baz"] * 2),
            tm.makeDateIndex(10),
            tm.makePeriodIndex(10),
            tm.makeTimedeltaIndex(10),
            tm.makeIntIndex(10),
            tm.makeUIntIndex(10),
            tm.makeIntIndex(10),
            tm.makeFloatIndex(10),
            Index([True, False]),
            Index(["a{}".format(i) for i in range(101)]),
            pd.MultiIndex.from_tuples(zip("ABCD", "EFGH")),
            pd.MultiIndex.from_tuples(zip([0, 1, 2, 3], "EFGH")),
        ],
    )
    def test_index_tab_completion(self, index):
        # dir contains string-like values of the Index.
        s = pd.Series(index=index)
        dir_s = dir(s)
        for i, x in enumerate(s.index.unique(level=0)):
            if i < 100:
                assert (not isinstance(x, str)
                        or not x.isidentifier()
                        or x in dir_s)
            else:
                assert x not in dir_s

    def test_not_hashable(self):
        s_empty = Series()
        s = Series([1])
        msg = "'Series' objects are mutable, thus they cannot be hashed"
        with pytest.raises(TypeError, match=msg):
            hash(s_empty)
        with pytest.raises(TypeError, match=msg):
            hash(s)

    def test_contains(self, datetime_series):
        tm.assert_contains_all(datetime_series.index, datetime_series)

    def test_iter_datetimes(self, datetime_series):
        for i, val in enumerate(datetime_series):
            assert val == datetime_series[i]

    def test_iter_strings(self, string_series):
        for i, val in enumerate(string_series):
            assert val == string_series[i]

    def test_keys(self, datetime_series):
        # HACK: By doing this in two stages, we avoid 2to3 wrapping the call
        # to .keys() in a list()
        getkeys = datetime_series.keys
        assert getkeys() is datetime_series.index

    def test_values(self, datetime_series):
        tm.assert_almost_equal(
            datetime_series.values, datetime_series, check_dtype=False
        )

    def test_iteritems_datetimes(self, datetime_series):
        for idx, val in datetime_series.iteritems():
            assert val == datetime_series[idx]

    def test_iteritems_strings(self, string_series):
        for idx, val in string_series.iteritems():
            assert val == string_series[idx]

        # assert is lazy (generators don't define reverse, lists do)
        assert not hasattr(string_series.iteritems(), "reverse")

    def test_items_datetimes(self, datetime_series):
        for idx, val in datetime_series.items():
            assert val == datetime_series[idx]

    def test_items_strings(self, string_series):
        for idx, val in string_series.items():
            assert val == string_series[idx]

        # assert is lazy (generators don't define reverse, lists do)
        assert not hasattr(string_series.items(), "reverse")

    def test_raise_on_info(self):
        s = Series(np.random.randn(10))
        msg = "'Series' object has no attribute 'info'"
        with pytest.raises(AttributeError, match=msg):
            s.info()

    def test_copy(self):
        for deep in [None, False, True]:
            s = Series(np.arange(10), dtype="float64")

            # default deep is True
            if deep is None:
                s2 = s.copy()
            else:
                s2 = s.copy(deep=deep)

            s2[::2] = np.NaN

            if deep is None or deep is True:
                # Did not modify original Series
                assert np.isnan(s2[0])
                assert not np.isnan(s[0])
            else:
                # we DID modify the original Series
                assert np.isnan(s2[0])
                assert np.isnan(s[0])

    def test_copy_tzaware(self):
        # GH#11794
        # copy of tz-aware
        expected = Series([Timestamp("2012/01/01", tz="UTC")])
        expected2 = Series([Timestamp("1999/01/01", tz="UTC")])

        for deep in [None, False, True]:
            s = Series([Timestamp("2012/01/01", tz="UTC")])

            if deep is None:
                s2 = s.copy()
            else:
                s2 = s.copy(deep=deep)

            s2[0] = pd.Timestamp("1999/01/01", tz="UTC")

            # default deep is True
            if deep is None or deep is True:
                # Did not modify original Series
                tm.assert_series_equal(s2, expected2)
                tm.assert_series_equal(s, expected)
            else:
                # we DID modify the original Series
                tm.assert_series_equal(s2, expected2)
                tm.assert_series_equal(s, expected2)

    def test_axis_alias(self):
        s = Series([1, 2, np.nan])
        tm.assert_series_equal(s.dropna(axis="rows"), s.dropna(axis="index"))
        assert s.dropna().sum("rows") == 3
        assert s._get_axis_number("rows") == 0
        assert s._get_axis_name("rows") == "index"

    def test_class_axis(self):
        # https://github.com/pandas-dev/pandas/issues/18147
        # no exception and no empty docstring
        assert pydoc.getdoc(Series.index)

    def test_numpy_unique(self, datetime_series):
        # it works!
        np.unique(datetime_series)

    def test_ndarray_compat(self):
        # test numpy compat with Series as sub-class of NDFrame
        tsdf = DataFrame(
            np.random.randn(1000, 3),
            columns=["A", "B", "C"],
            index=date_range("1/1/2000", periods=1000),
        )

        def f(x):
            return x[x.idxmax()]

        result = tsdf.apply(f)
        expected = tsdf.max()
        tm.assert_series_equal(result, expected)

        # .item()
        with tm.assert_produces_warning(FutureWarning):
            s = Series([1])
            result = s.item()
            assert result == 1
            assert s.item() == s.iloc[0]

        # using an ndarray like function
        s = Series(np.random.randn(10))
        result = Series(np.ones_like(s))
        expected = Series(1, index=range(10), dtype="float64")
        tm.assert_series_equal(result, expected)

        # ravel
        s = Series(np.random.randn(10))
        tm.assert_almost_equal(s.ravel(order="F"), s.values.ravel(order="F"))

        # compress
        # GH 6658
        s = Series([0, 1.0, -1], index=list("abc"))
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s > 0, s)
        tm.assert_series_equal(result, Series([1.0], index=["b"]))

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s < -1, s)
        # result empty Index(dtype=object) as the same as original
        exp = Series([], dtype="float64", index=Index([], dtype="object"))
        tm.assert_series_equal(result, exp)

        s = Series([0, 1.0, -1], index=[0.1, 0.2, 0.3])
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s > 0, s)
        tm.assert_series_equal(result, Series([1.0], index=[0.2]))

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s < -1, s)
        # result empty Float64Index as the same as original
        exp = Series([], dtype="float64", index=Index([], dtype="float64"))
        tm.assert_series_equal(result, exp)

    def test_str_accessor_updates_on_inplace(self):
        s = pd.Series(list("abc"))
        s.drop([0], inplace=True)
        assert len(s.str.lower()) == 2

    def test_str_attribute(self):
        # GH9068
        methods = ["strip", "rstrip", "lstrip"]
        s = Series([" jack", "jill ", " jesse ", "frank"])
        for method in methods:
            expected = Series([getattr(str, method)(x) for x in s.values])
            tm.assert_series_equal(getattr(Series.str, method)(s.str), expected)

        # str accessor only valid with string values
        s = Series(range(5))
        with pytest.raises(AttributeError, match="only use .str accessor"):
            s.str.repeat(2)

    def test_empty_method(self):
        s_empty = pd.Series()
        assert s_empty.empty

        for full_series in [pd.Series([1]), pd.Series(index=[1])]:
            assert not full_series.empty

    def test_tab_complete_warning(self, ip):
        # https://github.com/pandas-dev/pandas/issues/16409
        pytest.importorskip("IPython", minversion="6.0.0")
        from IPython.core.completer import provisionalcompleter

        code = "import pandas as pd; s = pd.Series()"
        ip.run_code(code)
        with tm.assert_produces_warning(None):
            with provisionalcompleter("ignore"):
                list(ip.Completer.completions("s.", 1))

    def test_integer_series_size(self):
        # GH 25580
        s = Series(range(9))
        assert s.size == 9

        s = Series(range(9), dtype="Int64")
        assert s.size == 9

    def test_get_values_deprecation(self):
        s = Series(range(9))
        with tm.assert_produces_warning(FutureWarning):
            res = s.get_values()
        tm.assert_numpy_array_equal(res, s.values)
import numpy as np
import pytest

import pandas as pd
from pandas.core.indexes.api import Index, MultiIndex
import pandas.util.testing as tm


@pytest.fixture(params=[tm.makeUnicodeIndex(100),
                        tm.makeStringIndex(100),
                        tm.makeDateIndex(100),
                        tm.makePeriodIndex(100),
                        tm.makeTimedeltaIndex(100),
                        tm.makeIntIndex(100),
                        tm.makeUIntIndex(100),
                        tm.makeRangeIndex(100),
                        tm.makeFloatIndex(100),
                        Index([True, False]),
                        tm.makeCategoricalIndex(100),
                        Index([]),
                        MultiIndex.from_tuples(zip(
                            ['foo', 'bar', 'baz'], [1, 2, 3])),
                        Index([0, 0, 1, 1, 2, 2])],
                ids=lambda x: type(x).__name__)
def indices(request):
    return request.param


@pytest.fixture(params=[1, np.array(1, dtype=np.int64)])
def one(request):
    # zero-dim integer array behaves like an integer
    return request.param
class TestSeriesMisc(TestData, SharedWithSparse):

    series_klass = Series
    # SharedWithSparse tests use generic, series_klass-agnostic assertion
    _assert_series_equal = staticmethod(tm.assert_series_equal)

    def test_tab_completion(self):
        # GH 9910
        s = Series(list("abcd"))
        # Series of str values should have .str but not .dt/.cat in __dir__
        assert "str" in dir(s)
        assert "dt" not in dir(s)
        assert "cat" not in dir(s)

        # similarly for .dt
        s = Series(date_range("1/1/2015", periods=5))
        assert "dt" in dir(s)
        assert "str" not in dir(s)
        assert "cat" not in dir(s)

        # Similarly for .cat, but with the twist that str and dt should be
        # there if the categories are of that type first cat and str.
        s = Series(list("abbcd"), dtype="category")
        assert "cat" in dir(s)
        assert "str" in dir(s)  # as it is a string categorical
        assert "dt" not in dir(s)

        # similar to cat and str
        s = Series(date_range("1/1/2015", periods=5)).astype("category")
        assert "cat" in dir(s)
        assert "str" not in dir(s)
        assert "dt" in dir(s)  # as it is a datetime categorical

    def test_tab_completion_with_categorical(self):
        # test the tab completion display
        ok_for_cat = [
            "name",
            "index",
            "categorical",
            "categories",
            "codes",
            "ordered",
            "set_categories",
            "add_categories",
            "remove_categories",
            "rename_categories",
            "reorder_categories",
            "remove_unused_categories",
            "as_ordered",
            "as_unordered",
        ]

        def get_dir(s):
            results = [r for r in s.cat.__dir__() if not r.startswith("_")]
            return list(sorted(set(results)))

        s = Series(list("aabbcde")).astype("category")
        results = get_dir(s)
        tm.assert_almost_equal(results, list(sorted(set(ok_for_cat))))

    @pytest.mark.parametrize(
        "index",
        [
            tm.makeUnicodeIndex(10),
            tm.makeStringIndex(10),
            tm.makeCategoricalIndex(10),
            Index(["foo", "bar", "baz"] * 2),
            tm.makeDateIndex(10),
            tm.makePeriodIndex(10),
            tm.makeTimedeltaIndex(10),
            tm.makeIntIndex(10),
            tm.makeUIntIndex(10),
            tm.makeIntIndex(10),
            tm.makeFloatIndex(10),
            Index([True, False]),
            Index(["a{}".format(i) for i in range(101)]),
            pd.MultiIndex.from_tuples(zip("ABCD", "EFGH")),
            pd.MultiIndex.from_tuples(zip([0, 1, 2, 3], "EFGH")),
        ],
    )
    def test_index_tab_completion(self, index):
        # dir contains string-like values of the Index.
        s = pd.Series(index=index)
        dir_s = dir(s)
        for i, x in enumerate(s.index.unique(level=0)):
            if i < 100:
                assert (not isinstance(x, str)
                        or not x.isidentifier()
                        or x in dir_s)
            else:
                assert x not in dir_s

    def test_not_hashable(self):
        s_empty = Series()
        s = Series([1])
        msg = "'Series' objects are mutable, thus they cannot be hashed"
        with pytest.raises(TypeError, match=msg):
            hash(s_empty)
        with pytest.raises(TypeError, match=msg):
            hash(s)

    def test_contains(self):
        tm.assert_contains_all(self.ts.index, self.ts)

    def test_iter(self):
        for i, val in enumerate(self.series):
            assert val == self.series[i]

        for i, val in enumerate(self.ts):
            assert val == self.ts[i]

    def test_keys(self):
        # HACK: By doing this in two stages, we avoid 2to3 wrapping the call
        # to .keys() in a list()
        getkeys = self.ts.keys
        assert getkeys() is self.ts.index

    def test_values(self):
        tm.assert_almost_equal(self.ts.values, self.ts, check_dtype=False)

    def test_iteritems(self):
        for idx, val in self.series.iteritems():
            assert val == self.series[idx]

        for idx, val in self.ts.iteritems():
            assert val == self.ts[idx]

        # assert is lazy (generators don't define reverse, lists do)
        assert not hasattr(self.series.iteritems(), "reverse")

    def test_items(self):
        for idx, val in self.series.items():
            assert val == self.series[idx]

        for idx, val in self.ts.items():
            assert val == self.ts[idx]

        # assert is lazy (generators don't define reverse, lists do)
        assert not hasattr(self.series.items(), "reverse")

    def test_raise_on_info(self):
        s = Series(np.random.randn(10))
        msg = "'Series' object has no attribute 'info'"
        with pytest.raises(AttributeError, match=msg):
            s.info()

    def test_copy(self):
        for deep in [None, False, True]:
            s = Series(np.arange(10), dtype="float64")

            # default deep is True
            if deep is None:
                s2 = s.copy()
            else:
                s2 = s.copy(deep=deep)

            s2[::2] = np.NaN

            if deep is None or deep is True:
                # Did not modify original Series
                assert np.isnan(s2[0])
                assert not np.isnan(s[0])
            else:
                # we DID modify the original Series
                assert np.isnan(s2[0])
                assert np.isnan(s[0])

    def test_copy_tzaware(self):
        # GH#11794
        # copy of tz-aware
        expected = Series([Timestamp("2012/01/01", tz="UTC")])
        expected2 = Series([Timestamp("1999/01/01", tz="UTC")])

        for deep in [None, False, True]:
            s = Series([Timestamp("2012/01/01", tz="UTC")])

            if deep is None:
                s2 = s.copy()
            else:
                s2 = s.copy(deep=deep)

            s2[0] = pd.Timestamp("1999/01/01", tz="UTC")

            # default deep is True
            if deep is None or deep is True:
                # Did not modify original Series
                assert_series_equal(s2, expected2)
                assert_series_equal(s, expected)
            else:
                # we DID modify the original Series
                assert_series_equal(s2, expected2)
                assert_series_equal(s, expected2)

    def test_axis_alias(self):
        s = Series([1, 2, np.nan])
        assert_series_equal(s.dropna(axis="rows"), s.dropna(axis="index"))
        assert s.dropna().sum("rows") == 3
        assert s._get_axis_number("rows") == 0
        assert s._get_axis_name("rows") == "index"

    def test_class_axis(self):
        # https://github.com/pandas-dev/pandas/issues/18147
        # no exception and no empty docstring
        assert pydoc.getdoc(Series.index)

    def test_numpy_unique(self):
        # it works!
        np.unique(self.ts)

    def test_ndarray_compat(self):
        # test numpy compat with Series as sub-class of NDFrame
        tsdf = DataFrame(
            np.random.randn(1000, 3),
            columns=["A", "B", "C"],
            index=date_range("1/1/2000", periods=1000),
        )

        def f(x):
            return x[x.idxmax()]

        result = tsdf.apply(f)
        expected = tsdf.max()
        tm.assert_series_equal(result, expected)

        # .item()
        with tm.assert_produces_warning(FutureWarning):
            s = Series([1])
            result = s.item()
            assert result == 1
            assert s.item() == s.iloc[0]

        # using an ndarray like function
        s = Series(np.random.randn(10))
        result = Series(np.ones_like(s))
        expected = Series(1, index=range(10), dtype="float64")
        tm.assert_series_equal(result, expected)

        # ravel
        s = Series(np.random.randn(10))
        tm.assert_almost_equal(s.ravel(order="F"), s.values.ravel(order="F"))

        # compress
        # GH 6658
        s = Series([0, 1.0, -1], index=list("abc"))
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s > 0, s)
        tm.assert_series_equal(result, Series([1.0], index=["b"]))

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s < -1, s)
        # result empty Index(dtype=object) as the same as original
        exp = Series([], dtype="float64", index=Index([], dtype="object"))
        tm.assert_series_equal(result, exp)

        s = Series([0, 1.0, -1], index=[0.1, 0.2, 0.3])
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s > 0, s)
        tm.assert_series_equal(result, Series([1.0], index=[0.2]))

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s < -1, s)
        # result empty Float64Index as the same as original
        exp = Series([], dtype="float64", index=Index([], dtype="float64"))
        tm.assert_series_equal(result, exp)

    def test_str_accessor_updates_on_inplace(self):
        s = pd.Series(list("abc"))
        s.drop([0], inplace=True)
        assert len(s.str.lower()) == 2

    def test_str_attribute(self):
        # GH9068
        methods = ["strip", "rstrip", "lstrip"]
        s = Series([" jack", "jill ", " jesse ", "frank"])
        for method in methods:
            expected = Series([getattr(str, method)(x) for x in s.values])
            assert_series_equal(getattr(Series.str, method)(s.str), expected)

        # str accessor only valid with string values
        s = Series(range(5))
        with pytest.raises(AttributeError, match="only use .str accessor"):
            s.str.repeat(2)

    def test_empty_method(self):
        s_empty = pd.Series()
        assert s_empty.empty

        for full_series in [pd.Series([1]), pd.Series(index=[1])]:
            assert not full_series.empty

    def test_tab_complete_warning(self, ip):
        # https://github.com/pandas-dev/pandas/issues/16409
        pytest.importorskip("IPython", minversion="6.0.0")
        from IPython.core.completer import provisionalcompleter

        code = "import pandas as pd; s = pd.Series()"
        ip.run_code(code)
        with tm.assert_produces_warning(None):
            with provisionalcompleter("ignore"):
                list(ip.Completer.completions("s.", 1))

    def test_integer_series_size(self):
        # GH 25580
        s = Series(range(9))
        assert s.size == 9

        s = Series(range(9), dtype="Int64")
        assert s.size == 9

    def test_get_values_deprecation(self):
        s = Series(range(9))
        with tm.assert_produces_warning(FutureWarning):
            res = s.get_values()
        tm.assert_numpy_array_equal(res, s.values)
def setup(self):
    N = 10**4
    self.iterables = [tm.makeStringIndex(N), range(20)]
def setup(self, regex):
    self.s = Series(tm.makeStringIndex(10**5))
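# Presumably this setup feeds regex-parametrized string-method benchmarks;
# a minimal sketch of the kind of call being timed (illustrative pattern):
import pandas as pd

s = pd.Series(['ab12', 'cd34'])
print(s.str.contains(r'\d{2}$', regex=True))  # [True, True]
print(s.str.extract(r'([a-z]+)(\d+)'))        # one column per capture group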
def run_command(key):
    pywren.wrenlogging.default_config('INFO')
    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger(__name__)
    logger.info("before everything")

    partition_num = key['partition_num']
    rounds = key['rounds']
    em = JiffyClient(host=key['em'])
    reduceId = key['taskId']
    appName = key['appName']
    alg_type = key['type']
    data_ques1 = open_or_create_jiffy_queues(em, appName, partition_num,
                                             1, 'receiver')
    logger.info("queue opened")
    names = key['names']
    dtypes = key['dtypes']

    ############# left table
    left_table = 100000
    indices = tm.makeStringIndex(left_table).values
    # NOTE: this rebinds the `key` parameter to the tiled key column
    key = np.tile(indices[:left_table], 1)
    left = DataFrame({"key": key, "value": np.random.randn(left_table)})

    t_start = time.time()

    ############### initialize join functions
    # print(left)
    lim = 0

    ############## keeps fetching
    fin_num = 0
    if alg_type == 'pipelined':
        leftsorter = None
        leftcount = None
        orizer = None
        intrizer = None
        count = 0
        while fin_num < partition_num and lim < 15:
            #### read table
            lim += 1
            time.sleep(0.01)
            logger.info("before get")
            obj = data_ques1[0].get()
            if sys.getsizeof(obj) > 1000:
                part_data = pd.read_table(BytesIO(obj), header=None,
                                          delimiter="|",
                                          names=['key', 'value2'])
            # ds, fin_num = read_jiffy_splits(names, dtypes, reduceId,
            #                                 data_ques1, fin_num,
            #                                 batch_size=1,
            #                                 fin_size=partition_num)
            logger.info(ds)
            logger.info(fin_num)
            # print(fin_num)
            if len(ds) > 0:
                ### join
                # start = timeit.default_timer()
                result, orizer, intrizer, leftsorter, leftcount = \
                    pipeline_merge(left, ds,
                                   factorizer=orizer,
                                   intfactorizer=intrizer,
                                   leftsorter=leftsorter,
                                   leftcount=leftcount,
                                   slices=8,
                                   how="pipeline")
                time.sleep(0.8)
                logger.info("merged")
                # end = timeit.default_timer()
                # count += (end - start)
                # logger.info(str(i) + " chunks take time " + str(end - start)
                #             + " Accum time: " + str(count))
    elif alg_type == 'origin':
        ds = pd.DataFrame()
        while fin_num < partition_num and lim < 1500:
            lim += 1
            #### read table
            dd, fin_num = read_jiffy_splits(names, dtypes, reduceId,
                                            data_ques1, fin_num,
                                            batch_size=1,
                                            fin_size=partition_num)
            if len(dd) > 0:
                ds = ds.append(dd)
        print("this is ds:")
        print(ds)
        result = merge(left, ds, how="inner")

    print(fin_num)
    t_fin = time.time()
    # share.append([t_start,t_fin, fin_num])
    return ([t_fin, t_start])
def setup(self):
    self.s = Series(tm.makeStringIndex(10**5)).str.join('|')
def setup(self):
    self.uniques = tm.makeStringIndex(1000).values
    self.all = self.uniques.repeat(10)
def test_duplicates(idx):
    assert not idx.has_duplicates
    assert idx.append(idx).has_duplicates

    index = MultiIndex(levels=[[0, 1], [0, 1, 2]],
                       labels=[[0, 0, 0, 0, 1, 1, 1],
                               [0, 1, 2, 0, 0, 1, 2]])
    assert index.has_duplicates

    # GH 9075
    t = [(u('x'), u('out'), u('z'), 5, u('y'), u('in'), u('z'), 169),
         (u('x'), u('out'), u('z'), 7, u('y'), u('in'), u('z'), 119),
         (u('x'), u('out'), u('z'), 9, u('y'), u('in'), u('z'), 135),
         (u('x'), u('out'), u('z'), 13, u('y'), u('in'), u('z'), 145),
         (u('x'), u('out'), u('z'), 14, u('y'), u('in'), u('z'), 158),
         (u('x'), u('out'), u('z'), 16, u('y'), u('in'), u('z'), 122),
         (u('x'), u('out'), u('z'), 17, u('y'), u('in'), u('z'), 160),
         (u('x'), u('out'), u('z'), 18, u('y'), u('in'), u('z'), 180),
         (u('x'), u('out'), u('z'), 20, u('y'), u('in'), u('z'), 143),
         (u('x'), u('out'), u('z'), 21, u('y'), u('in'), u('z'), 128),
         (u('x'), u('out'), u('z'), 22, u('y'), u('in'), u('z'), 129),
         (u('x'), u('out'), u('z'), 25, u('y'), u('in'), u('z'), 111),
         (u('x'), u('out'), u('z'), 28, u('y'), u('in'), u('z'), 114),
         (u('x'), u('out'), u('z'), 29, u('y'), u('in'), u('z'), 121),
         (u('x'), u('out'), u('z'), 31, u('y'), u('in'), u('z'), 126),
         (u('x'), u('out'), u('z'), 32, u('y'), u('in'), u('z'), 155),
         (u('x'), u('out'), u('z'), 33, u('y'), u('in'), u('z'), 123),
         (u('x'), u('out'), u('z'), 12, u('y'), u('in'), u('z'), 144)]

    index = pd.MultiIndex.from_tuples(t)
    assert not index.has_duplicates

    # handle int64 overflow if possible
    def check(nlevels, with_nulls):
        labels = np.tile(np.arange(500), 2)
        level = np.arange(500)

        if with_nulls:
            # inject some null values
            labels[500] = -1  # common nan value
            labels = [labels.copy() for i in range(nlevels)]
            for i in range(nlevels):
                labels[i][500 + i - nlevels // 2] = -1

            labels += [np.array([-1, 1]).repeat(500)]
        else:
            labels = [labels] * nlevels + [np.arange(2).repeat(500)]

        levels = [level] * nlevels + [[0, 1]]

        # no dups
        index = MultiIndex(levels=levels, labels=labels)
        assert not index.has_duplicates

        # with a dup
        if with_nulls:
            def f(a):
                return np.insert(a, 1000, a[0])

            labels = list(map(f, labels))
            index = MultiIndex(levels=levels, labels=labels)
        else:
            values = index.values.tolist()
            index = MultiIndex.from_tuples(values + [values[0]])

        assert index.has_duplicates

    # no overflow
    check(4, False)
    check(4, True)

    # overflow possible
    check(8, False)
    check(8, True)

    # GH 9125
    n, k = 200, 5000
    levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
    labels = [np.random.choice(n, k * n) for lev in levels]
    mi = MultiIndex(levels=levels, labels=labels)

    for keep in ['first', 'last', False]:
        left = mi.duplicated(keep=keep)
        right = pd._libs.hashtable.duplicated_object(mi.values, keep=keep)
        tm.assert_numpy_array_equal(left, right)

    # GH5873
    for a in [101, 102]:
        mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]])
        assert not mi.has_duplicates

        with warnings.catch_warnings(record=True):
            # Deprecated - see GH20239
            assert mi.get_duplicates().equals(
                MultiIndex.from_arrays([[], []]))

        tm.assert_numpy_array_equal(mi.duplicated(),
                                    np.zeros(2, dtype='bool'))

    for n in range(1, 6):  # 1st level shape
        for m in range(1, 5):  # 2nd level shape
            # all possible unique combinations, including nan
            lab = product(range(-1, n), range(-1, m))
            mi = MultiIndex(levels=[list('abcde')[:n], list('WXYZ')[:m]],
                            labels=np.random.permutation(list(lab)).T)
            assert len(mi) == (n + 1) * (m + 1)
            assert not mi.has_duplicates

            with warnings.catch_warnings(record=True):
                # Deprecated - see GH20239
                assert mi.get_duplicates().equals(
                    MultiIndex.from_arrays([[], []]))

            tm.assert_numpy_array_equal(mi.duplicated(),
                                        np.zeros(len(mi), dtype='bool'))
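# A minimal sketch of the Index.duplicated semantics checked above: `keep`
# controls which occurrence is treated as non-duplicate. Illustrative values.
import pandas as pd

idx = pd.Index(['a', 'b', 'a', 'a'])
print(idx.duplicated(keep='first'))  # [False False  True  True]
print(idx.duplicated(keep='last'))   # [ True False  True False]
print(idx.duplicated(keep=False))    # [ True False  True  True]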
def setup(self):
    N = 10000
    K = 10
    self.df = DataFrame({'key1': tm.makeStringIndex(N).values.repeat(K),
                         'key2': tm.makeStringIndex(N).values.repeat(K),
                         'value': np.random.randn(N * K)})
def setUp(self):
    self.strIndex = common.makeStringIndex(100)
    self.dateIndex = common.makeDateIndex(100)
    self.intIndex = common.makeIntIndex(100)
import string

import numpy as np

from pandas import (DataFrame, MultiIndex, Series, concat, date_range,
                    merge, merge_asof)
from pandas import pipeline_merge
import pandas.util.testing as tm

try:
    from pandas import merge_ordered
except ImportError:
    from pandas import ordered_merge as merge_ordered

N = 10000000
pieces = 10
indices = tm.makeStringIndex(N).values
indices2 = tm.makeStringIndex(N).values
key = np.tile(indices[:500000], 1)
key2 = np.tile(indices2[:500000], 1)
left = DataFrame({"key": key, "value": np.random.randn(500000)})
right = {}
right = DataFrame({
    "key": indices[1 * 100000 + 50000:9 * 100000 + 50000],
    "value2": np.random.randn(800000),
})

result = pipeline_merge(left, right, how="pipeline")

# def time_merge_dataframe_integer_2key(self, sort):
#     pipeline_merge(self.df, self.df3, how="pipeline")
#
# def time_merge_dataframe_integer_key(self, sort):
class TestSeriesMisc(TestData, SharedWithSparse):

    series_klass = Series
    # SharedWithSparse tests use generic, series_klass-agnostic assertion
    _assert_series_equal = staticmethod(tm.assert_series_equal)

    def test_tab_completion(self):
        # GH 9910
        s = Series(list('abcd'))
        # Series of str values should have .str but not .dt/.cat in __dir__
        assert 'str' in dir(s)
        assert 'dt' not in dir(s)
        assert 'cat' not in dir(s)

        # similarly for .dt
        s = Series(date_range('1/1/2015', periods=5))
        assert 'dt' in dir(s)
        assert 'str' not in dir(s)
        assert 'cat' not in dir(s)

        # Similarly for .cat, but with the twist that str and dt should be
        # there if the categories are of that type first cat and str.
        s = Series(list('abbcd'), dtype="category")
        assert 'cat' in dir(s)
        assert 'str' in dir(s)  # as it is a string categorical
        assert 'dt' not in dir(s)

        # similar to cat and str
        s = Series(date_range('1/1/2015', periods=5)).astype("category")
        assert 'cat' in dir(s)
        assert 'str' not in dir(s)
        assert 'dt' in dir(s)  # as it is a datetime categorical

    def test_tab_completion_with_categorical(self):
        # test the tab completion display
        ok_for_cat = [
            'categories', 'codes', 'ordered', 'set_categories',
            'add_categories', 'remove_categories', 'rename_categories',
            'reorder_categories', 'remove_unused_categories', 'as_ordered',
            'as_unordered'
        ]

        def get_dir(s):
            results = [r for r in s.cat.__dir__() if not r.startswith('_')]
            return list(sorted(set(results)))

        s = Series(list('aabbcde')).astype('category')
        results = get_dir(s)
        tm.assert_almost_equal(results, list(sorted(set(ok_for_cat))))

    @pytest.mark.parametrize("index", [
        tm.makeUnicodeIndex(10),
        tm.makeStringIndex(10),
        tm.makeCategoricalIndex(10),
        Index(['foo', 'bar', 'baz'] * 2),
        tm.makeDateIndex(10),
        tm.makePeriodIndex(10),
        tm.makeTimedeltaIndex(10),
        tm.makeIntIndex(10),
        tm.makeUIntIndex(10),
        tm.makeIntIndex(10),
        tm.makeFloatIndex(10),
        Index([True, False]),
        Index(['a{}'.format(i) for i in range(101)]),
        pd.MultiIndex.from_tuples(lzip('ABCD', 'EFGH')),
        pd.MultiIndex.from_tuples(lzip([0, 1, 2, 3], 'EFGH')),
    ])
    def test_index_tab_completion(self, index):
        # dir contains string-like values of the Index.
        s = pd.Series(index=index)
        dir_s = dir(s)
        for i, x in enumerate(s.index.unique(level=0)):
            if i < 100:
                assert (not isinstance(x, string_types) or
                        not isidentifier(x) or x in dir_s)
            else:
                assert x not in dir_s

    def test_not_hashable(self):
        s_empty = Series()
        s = Series([1])
        pytest.raises(TypeError, hash, s_empty)
        pytest.raises(TypeError, hash, s)

    def test_contains(self):
        tm.assert_contains_all(self.ts.index, self.ts)

    def test_iter(self):
        for i, val in enumerate(self.series):
            assert val == self.series[i]

        for i, val in enumerate(self.ts):
            assert val == self.ts[i]

    def test_keys(self):
        # HACK: By doing this in two stages, we avoid 2to3 wrapping the call
        # to .keys() in a list()
        getkeys = self.ts.keys
        assert getkeys() is self.ts.index

    def test_values(self):
        tm.assert_almost_equal(self.ts.values, self.ts, check_dtype=False)

    def test_iteritems(self):
        for idx, val in compat.iteritems(self.series):
            assert val == self.series[idx]

        for idx, val in compat.iteritems(self.ts):
            assert val == self.ts[idx]

        # assert is lazy (generators don't define reverse, lists do)
        assert not hasattr(self.series.iteritems(), 'reverse')

    def test_items(self):
        for idx, val in self.series.items():
            assert val == self.series[idx]

        for idx, val in self.ts.items():
            assert val == self.ts[idx]

        # assert is lazy (generators don't define reverse, lists do)
        assert not hasattr(self.series.items(), 'reverse')

    def test_raise_on_info(self):
        s = Series(np.random.randn(10))
        with pytest.raises(AttributeError):
            s.info()

    def test_copy(self):

        for deep in [None, False, True]:
            s = Series(np.arange(10), dtype='float64')

            # default deep is True
            if deep is None:
                s2 = s.copy()
            else:
                s2 = s.copy(deep=deep)

            s2[::2] = np.NaN

            if deep is None or deep is True:
                # Did not modify original Series
                assert np.isnan(s2[0])
                assert not np.isnan(s[0])
            else:
                # we DID modify the original Series
                assert np.isnan(s2[0])
                assert np.isnan(s[0])

        # GH 11794
        # copy of tz-aware
        expected = Series([Timestamp('2012/01/01', tz='UTC')])
        expected2 = Series([Timestamp('1999/01/01', tz='UTC')])

        for deep in [None, False, True]:

            s = Series([Timestamp('2012/01/01', tz='UTC')])

            if deep is None:
                s2 = s.copy()
            else:
                s2 = s.copy(deep=deep)

            s2[0] = pd.Timestamp('1999/01/01', tz='UTC')

            # default deep is True
            if deep is None or deep is True:
                # Did not modify original Series
                assert_series_equal(s2, expected2)
                assert_series_equal(s, expected)
            else:
                # we DID modify the original Series
                assert_series_equal(s2, expected2)
                assert_series_equal(s, expected2)

    def test_axis_alias(self):
        s = Series([1, 2, np.nan])
        assert_series_equal(s.dropna(axis='rows'), s.dropna(axis='index'))
        assert s.dropna().sum('rows') == 3
        assert s._get_axis_number('rows') == 0
        assert s._get_axis_name('rows') == 'index'

    def test_class_axis(self):
        # https://github.com/pandas-dev/pandas/issues/18147
        # no exception and no empty docstring
        assert pydoc.getdoc(Series.index)

    def test_numpy_unique(self):
        # it works!
        np.unique(self.ts)

    def test_ndarray_compat(self):

        # test numpy compat with Series as sub-class of NDFrame
        tsdf = DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'],
                         index=date_range('1/1/2000', periods=1000))

        def f(x):
            return x[x.idxmax()]

        result = tsdf.apply(f)
        expected = tsdf.max()
        tm.assert_series_equal(result, expected)

        # .item()
        s = Series([1])
        result = s.item()
        assert result == 1
        assert s.item() == s.iloc[0]

        # using an ndarray like function
        s = Series(np.random.randn(10))
        result = Series(np.ones_like(s))
        expected = Series(1, index=range(10), dtype='float64')
        tm.assert_series_equal(result, expected)

        # ravel
        s = Series(np.random.randn(10))
        tm.assert_almost_equal(s.ravel(order='F'), s.values.ravel(order='F'))

        # compress
        # GH 6658
        s = Series([0, 1., -1], index=list('abc'))
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s > 0, s)
        tm.assert_series_equal(result, Series([1.], index=['b']))

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s < -1, s)
        # result empty Index(dtype=object) as the same as original
        exp = Series([], dtype='float64', index=Index([], dtype='object'))
        tm.assert_series_equal(result, exp)

        s = Series([0, 1., -1], index=[.1, .2, .3])
        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s > 0, s)
        tm.assert_series_equal(result, Series([1.], index=[.2]))

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            result = np.compress(s < -1, s)
        # result empty Float64Index as the same as original
        exp = Series([], dtype='float64', index=Index([], dtype='float64'))
        tm.assert_series_equal(result, exp)

    def test_str_attribute(self):
        # GH9068
        methods = ['strip', 'rstrip', 'lstrip']
        s = Series([' jack', 'jill ', ' jesse ', 'frank'])
        for method in methods:
            expected = Series([getattr(str, method)(x) for x in s.values])
            assert_series_equal(getattr(Series.str, method)(s.str), expected)

        # str accessor only valid with string values
        s = Series(range(5))
        with tm.assert_raises_regex(AttributeError,
                                    'only use .str accessor'):
            s.str.repeat(2)

    def test_empty_method(self):
        s_empty = pd.Series()
        assert s_empty.empty

        for full_series in [pd.Series([1]), pd.Series(index=[1])]:
            assert not full_series.empty

    def test_tab_complete_warning(self, ip):
        # https://github.com/pandas-dev/pandas/issues/16409
        pytest.importorskip('IPython', minversion="6.0.0")
        from IPython.core.completer import provisionalcompleter

        code = "import pandas as pd; s = pd.Series()"
        ip.run_code(code)
        with tm.assert_produces_warning(None):
            with provisionalcompleter('ignore'):
                list(ip.Completer.completions('s.', 1))
def setup_method(self, method):
    self.bool_index = tm.makeBoolIndex(10, name="a")
    self.int_index = tm.makeIntIndex(10, name="a")
    self.float_index = tm.makeFloatIndex(10, name="a")
    self.dt_index = tm.makeDateIndex(10, name="a")
    self.dt_tz_index = tm.makeDateIndex(
        10, name="a").tz_localize(tz="US/Eastern")
    self.period_index = tm.makePeriodIndex(10, name="a")
    self.string_index = tm.makeStringIndex(10, name="a")
    self.unicode_index = tm.makeUnicodeIndex(10, name="a")

    arr = np.random.randn(10)
    self.bool_series = Series(arr, index=self.bool_index, name="a")
    self.int_series = Series(arr, index=self.int_index, name="a")
    self.float_series = Series(arr, index=self.float_index, name="a")
    self.dt_series = Series(arr, index=self.dt_index, name="a")
    self.dt_tz_series = self.dt_tz_index.to_series()
    self.period_series = Series(arr, index=self.period_index, name="a")
    self.string_series = Series(arr, index=self.string_index, name="a")
    self.unicode_series = Series(arr, index=self.unicode_index, name="a")

    types = ["bool", "int", "float", "dt", "dt_tz", "period", "string",
             "unicode"]
    self.indexes = [getattr(self, "{}_index".format(t)) for t in types]
    self.series = [getattr(self, "{}_series".format(t)) for t in types]

    # To test narrow dtypes, we use narrower *data* elements, not *index*
    # elements
    index = self.int_index
    self.float32_series = Series(arr.astype(np.float32), index=index,
                                 name="a")

    arr_int = np.random.choice(10, size=10, replace=False)
    self.int8_series = Series(arr_int.astype(np.int8), index=index,
                              name="a")
    self.int16_series = Series(arr_int.astype(np.int16), index=index,
                               name="a")
    self.int32_series = Series(arr_int.astype(np.int32), index=index,
                               name="a")

    self.uint8_series = Series(arr_int.astype(np.uint8), index=index,
                               name="a")
    self.uint16_series = Series(arr_int.astype(np.uint16), index=index,
                                name="a")
    self.uint32_series = Series(arr_int.astype(np.uint32), index=index,
                                name="a")

    nrw_types = ["float32", "int8", "int16", "int32", "uint8", "uint16",
                 "uint32"]
    self.narrow_series = [getattr(self, "{}_series".format(t))
                          for t in nrw_types]

    self.objs = self.indexes + self.series + self.narrow_series
def setup(self):
    np.random.seed(1234)
    self.uniques = tm.makeStringIndex(1000).values
    self.all = self.uniques.repeat(10)
import numpy as np
import pytest

import pandas as pd
from pandas.core.indexes.api import Index, MultiIndex
import pandas.util.testing as tm

indices_dict = {
    "unicode": tm.makeUnicodeIndex(100),
    "string": tm.makeStringIndex(100),
    "datetime": tm.makeDateIndex(100),
    "period": tm.makePeriodIndex(100),
    "timedelta": tm.makeTimedeltaIndex(100),
    "int": tm.makeIntIndex(100),
    "uint": tm.makeUIntIndex(100),
    "range": tm.makeRangeIndex(100),
    "float": tm.makeFloatIndex(100),
    "bool": Index([True, False]),
    "categorical": tm.makeCategoricalIndex(100),
    "interval": tm.makeIntervalIndex(100),
    "empty": Index([]),
    "tuples": MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])),
    "repeats": Index([0, 0, 1, 1, 2, 2]),
}


@pytest.fixture(params=indices_dict.keys())
def indices(request):
    # copy to avoid mutation, e.g. setting .name
    return indices_dict[request.param].copy()
@pytest.mark.parametrize("idx", [tm.makeIntIndex(10),
                                 tm.makeFloatIndex(10),
                                 tm.makePeriodIndex(10)])
def test_invalid_index_types(idx):
    msg = ("(cannot infer freq from a non-convertible)|"
           "(Check the `freq` attribute instead of using infer_freq)")

    with pytest.raises(TypeError, match=msg):
        frequencies.infer_freq(idx)


@pytest.mark.skipif(is_platform_windows(),
                    reason="see gh-10822: Windows issue")
@pytest.mark.parametrize("idx", [tm.makeStringIndex(10),
                                 tm.makeUnicodeIndex(10)])
def test_invalid_index_types_unicode(idx):
    # see gh-10822
    #
    # Odd error message on conversions to datetime for unicode.
    msg = "Unknown string format"

    with pytest.raises(ValueError, match=msg):
        frequencies.infer_freq(idx)


def test_string_datetime_like_compat():
    # see gh-6463
    data = ["2004-01", "2004-02", "2004-03", "2004-04"]

    expected = frequencies.infer_freq(data)
def setup(self, keep):
    N = 10**5
    self.int_idx = pd.Int64Index(np.arange(N).repeat(5))
    self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5))
    self.string_idx = tm.makeStringIndex(N)