def _test_stack_with_multiindex(multiindex):
    """Stack a frame whose columns are (a slice of) a MultiIndex and check
    level/dropna handling (GH 8844)."""
    df = DataFrame(
        np.arange(3 * len(multiindex)).reshape(3, len(multiindex)),
        columns=multiindex,
    )
    for level in (-1, 0, 1, [0, 1], [1, 0]):
        result = df.stack(level=level, dropna=False)
        if isinstance(level, int):
            # Stacking a single level should not make any all-NaN rows,
            # so df.stack(level=level, dropna=False) should be the same
            # as df.stack(level=level, dropna=True).
            expected = df.stack(level=level, dropna=True)
            if isinstance(expected, Series):
                tm.assert_series_equal(result, expected)
            else:
                tm.assert_frame_equal(result, expected)

        # Rebuilding the columns from their tuples must not change the
        # stacked result.
        df.columns = MultiIndex.from_tuples(df.columns.to_numpy(),
                                            names=df.columns.names)
        expected = df.stack(level=level, dropna=False)
        if isinstance(expected, Series):
            tm.assert_series_equal(result, expected)
        else:
            tm.assert_frame_equal(result, expected)
def test_stack_mixed_level(self): # GH 18310 levels = [range(3), [3, 'a', 'b'], [1, 2]] # flat columns: df = DataFrame(1, index=levels[0], columns=levels[1]) result = df.stack() expected = Series(1, index=MultiIndex.from_product(levels[:2])) assert_series_equal(result, expected) # MultiIndex columns: df = DataFrame(1, index=levels[0], columns=MultiIndex.from_product(levels[1:])) result = df.stack(1) expected = DataFrame(1, index=MultiIndex.from_product( [levels[0], levels[2]]), columns=levels[1]) assert_frame_equal(result, expected) # as above, but used labels in level are actually of homogeneous type result = df[['a', 'b']].stack(1) expected = expected[['a', 'b']] assert_frame_equal(result, expected)
def test_stack_mixed_levels(self):
    """Stack by a mixture of level names and numbers (GH #8584)."""
    columns = MultiIndex.from_tuples(
        [('A', 'cat', 'long'), ('B', 'cat', 'long'),
         ('A', 'dog', 'short'), ('B', 'dog', 'short')],
        names=['exp', 'animal', 'hair_length'])
    frame = DataFrame(randn(4, 4), columns=columns)

    by_animal_hair = frame.stack(level=['animal', 'hair_length'])
    by_exp_hair = frame.stack(level=['exp', 'hair_length'])

    # A number that is simultaneously a level name and a valid level
    # position must be resolved as a name (GH #8584).
    renamed = frame.copy()
    renamed.columns.names = ['exp', 'animal', 1]
    assert_frame_equal(renamed.stack(level=['animal', 1]),
                       by_animal_hair, check_names=False)
    assert_frame_equal(renamed.stack(level=['exp', 1]),
                       by_exp_hair, check_names=False)

    # Mixing names with ints that are *not* level names must raise.
    pytest.raises(ValueError, renamed.stack, level=['animal', 0])

    # A level literally named 0 used to trigger a lexsort-depth error.
    zero_named = frame.copy()
    zero_named.columns.names = ['exp', 'animal', 0]
    assert_frame_equal(zero_named.stack(level=['animal', 0]),
                       by_animal_hair, check_names=False)
def test_stack_int_level_names(self): columns = MultiIndex.from_tuples( [('A', 'cat', 'long'), ('B', 'cat', 'long'), ('A', 'dog', 'short'), ('B', 'dog', 'short')], names=['exp', 'animal', 'hair_length'] ) df = DataFrame(randn(4, 4), columns=columns) exp_animal_stacked = df.stack(level=['exp', 'animal']) animal_hair_stacked = df.stack(level=['animal', 'hair_length']) exp_hair_stacked = df.stack(level=['exp', 'hair_length']) df2 = df.copy() df2.columns.names = [0, 1, 2] assert_frame_equal(df2.stack(level=[1, 2]), animal_hair_stacked, check_names=False) assert_frame_equal(df2.stack(level=[0, 1]), exp_animal_stacked, check_names=False) assert_frame_equal(df2.stack(level=[0, 2]), exp_hair_stacked, check_names=False) # Out-of-order int column names df3 = df.copy() df3.columns.names = [2, 0, 1] assert_frame_equal(df3.stack(level=[0, 1]), animal_hair_stacked, check_names=False) assert_frame_equal(df3.stack(level=[2, 0]), exp_animal_stacked, check_names=False) assert_frame_equal(df3.stack(level=[2, 1]), exp_hair_stacked, check_names=False)
def test_stack_mixed_levels(self):
    """Stack by a mixture of level names and numbers (GH #8584)."""
    columns = MultiIndex.from_tuples(
        [('A', 'cat', 'long'), ('B', 'cat', 'long'),
         ('A', 'dog', 'short'), ('B', 'dog', 'short')],
        names=['exp', 'animal', 'hair_length']
    )
    df = DataFrame(randn(4, 4), columns=columns)
    animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
    exp_hair_stacked = df.stack(level=['exp', 'hair_length'])

    # GH #8584: Need to check that stacking works when a number
    # is passed that is both a level name and in the range of
    # the level numbers
    df2 = df.copy()
    df2.columns.names = ['exp', 'animal', 1]
    assert_frame_equal(df2.stack(level=['animal', 1]),
                       animal_hair_stacked, check_names=False)
    assert_frame_equal(df2.stack(level=['exp', 1]),
                       exp_hair_stacked, check_names=False)

    # When mixed types are passed and the ints are not level
    # names, raise
    pytest.raises(ValueError, df2.stack, level=['animal', 0])

    # GH #8584: Having 0 in the level names could raise a
    # strange error about lexsort depth
    df3 = df.copy()
    df3.columns.names = ['exp', 'animal', 0]
    assert_frame_equal(df3.stack(level=['animal', 0]),
                       animal_hair_stacked, check_names=False)
def test_stack_int_level_names(self): columns = MultiIndex.from_tuples( [('A', 'cat', 'long'), ('B', 'cat', 'long'), ('A', 'dog', 'short'), ('B', 'dog', 'short')], names=['exp', 'animal', 'hair_length']) df = DataFrame(randn(4, 4), columns=columns) exp_animal_stacked = df.stack(level=['exp', 'animal']) animal_hair_stacked = df.stack(level=['animal', 'hair_length']) exp_hair_stacked = df.stack(level=['exp', 'hair_length']) df2 = df.copy() df2.columns.names = [0, 1, 2] assert_frame_equal(df2.stack(level=[1, 2]), animal_hair_stacked, check_names=False) assert_frame_equal(df2.stack(level=[0, 1]), exp_animal_stacked, check_names=False) assert_frame_equal(df2.stack(level=[0, 2]), exp_hair_stacked, check_names=False) # Out-of-order int column names df3 = df.copy() df3.columns.names = [2, 0, 1] assert_frame_equal(df3.stack(level=[0, 1]), animal_hair_stacked, check_names=False) assert_frame_equal(df3.stack(level=[2, 0]), exp_animal_stacked, check_names=False) assert_frame_equal(df3.stack(level=[2, 1]), exp_hair_stacked, check_names=False)
def test_stack_mixed_levels(self):
    """Stack by a mixture of level names and numbers (GH #8584).

    unittest-style variant: uses ``self.assertRaises``.
    """
    columns = MultiIndex.from_tuples(
        [("A", "cat", "long"), ("B", "cat", "long"),
         ("A", "dog", "short"), ("B", "dog", "short")],
        names=["exp", "animal", "hair_length"],
    )
    df = DataFrame(randn(4, 4), columns=columns)
    animal_hair_stacked = df.stack(level=["animal", "hair_length"])
    exp_hair_stacked = df.stack(level=["exp", "hair_length"])

    # GH #8584: Need to check that stacking works when a number
    # is passed that is both a level name and in the range of
    # the level numbers
    df2 = df.copy()
    df2.columns.names = ["exp", "animal", 1]
    assert_frame_equal(df2.stack(level=["animal", 1]),
                       animal_hair_stacked, check_names=False)
    assert_frame_equal(df2.stack(level=["exp", 1]),
                       exp_hair_stacked, check_names=False)

    # When mixed types are passed and the ints are not level
    # names, raise
    self.assertRaises(ValueError, df2.stack, level=["animal", 0])

    # GH #8584: Having 0 in the level names could raise a
    # strange error about lexsort depth
    df3 = df.copy()
    df3.columns.names = ["exp", "animal", 0]
    assert_frame_equal(df3.stack(level=["animal", 0]),
                       animal_hair_stacked, check_names=False)
def ler_base_fechamento(path, tipo, nome):
    """Read the closing-price ('01_fechamento') and earnings-per-share
    ('09_lpa') CSV files from *path* and join them on (ticker, date).

    NOTE(review): the *nome* argument is immediately overwritten below, so it
    is effectively unused — confirm whether callers rely on passing it.
    NOTE(review): ``dados_ibov`` is built but never returned; presumably a
    ``return dados_ibov`` is missing — verify against callers.
    """
    # --- closing prices ---
    nome = '01_fechamento'
    file = path + nome + tipo
    fechamento = read_csv(file, sep=';')
    fechamento = fechamento.set_index('codigo')
    datas = fechamento.columns.values.tolist()
    tikers = fechamento.index.values.tolist()
    values = fechamento.values
    # Rebuild with plain list index/columns and a uniform float64 dtype.
    fechamento = DataFrame(values, index=tikers, columns=datas,
                           dtype='float64')
    fechamento = fechamento.stack()  # Series indexed by (ticker, date)
    # --- earnings per share (LPA) ---
    nome = '09_lpa'
    file = path + nome + tipo
    lpa = read_csv(file, sep=';')
    lpa = lpa.set_index('codigo')
    datas = lpa.columns.values.tolist()
    tikers = lpa.index.values.tolist()
    values = lpa.values
    lpa = DataFrame(values, index=tikers, columns=datas, dtype='float64')
    lpa = lpa.stack()
    lpa = lpa.unstack()  # back to wide form
    # NOTE(review): ``fechamento`` is a Series here and Series has no
    # ``join`` in current pandas — confirm the intended pandas version.
    dados_ibov = fechamento.join(lpa, how='outer')
def cal_SMB_HML(ret, size, BM, percentile1=None, percentile2=None,
                independent=True, exclude_30_small_size=False):
    """Compute SMB and HML factor returns from (date x stock) panels of
    returns *ret*, market cap *size* and book-to-market *BM*.

    Returns a tuple ``(SMB, HML)`` of date-indexed spreads.

    NOTE(review): *percentile2* only receives its default when *percentile1*
    is None — passing ``percentile1`` alone leaves ``percentile2 = None``
    and will fail inside ``qcut``; confirm whether that is intended.
    """
    if exclude_30_small_size:
        # Screen out the bottom 30% of stocks by size before sorting.
        size = ClipQuantile(size, [0.0, 0.3, 1.0], [-1.0, 1.0])
    ret, size, BM = IndexAlign(ret, size, BM)
    valid_ = ~pd.isnull(
        BM + ret + size
    )  # a TypeError "bad operand type for unary ~: 'float'" here means the
    # index or columns of the three inputs are misaligned
    size = size[valid_]
    BM = BM[valid_]
    ret = ret[valid_]
    if percentile1 is None:
        percentile1 = [0.0, 0.5, 1.0]  # size breakpoints
        percentile2 = [0.0, 0.3, 0.7, 1.0]  # value (B/M) breakpoints
    label_1 = [i + 1 for i in range(len(percentile1) - 1)]
    label_2 = [i + 1 for i in range(len(percentile2) - 1)]
    if independent:
        # Independent double sort: size and B/M buckets assigned separately.
        #mark_1 = pd.DataFrame([pd.qcut(size.iloc[i], q=percentile1, labels=label_1)
        #                       for i in size.index[:-1]],
        #                      index=size.index[:-1])  # raised an error
        mark_1 = DataFrame([
            qcut(size.loc[i], q=percentile1, labels=label_1)
            for i in size.index
        ])
        mark_2 = DataFrame([
            qcut(BM.loc[i], q=percentile2, labels=label_2) for i in BM.index
        ])  # the indicator has already been shift(1)-ed, i.e. its time index
        # matches the holding period of the portfolio
    else:
        # Dependent sort: B/M buckets computed within each size bucket.
        mark_1 = DataFrame([
            qcut(size.loc[i], q=percentile1, labels=label_1)
            for i in size.index
        ])  # the indicator has already been shift(1)-ed, i.e. its time index
        # matches the holding period of the portfolio
        mark_2 = DataFrame(index=mark_1.index, columns=mark_1.columns)
        for l_ in label_1:
            tmp = DataFrame([
                qcut(BM.loc[i][mark_1.iloc[i] == l_],
                     q=percentile2,
                     labels=label_2) for i in BM.index
            ])
            mark_2 = mark_2.combine_first(tmp)
    #valid_ = ~(pd.isnull(mark_1 + mark_2) | pd.isnull(ret.iloc[1:]))
    # a valid stock must have last month's indicator and trade this period
    df = DataFrame()
    df['rtn'] = ret.stack()
    df['ref1'] = mark_1.stack()
    df['ref2'] = mark_2.stack()
    # Mean return per (date, size-bucket, value-bucket) cell.
    tmp = df.groupby(level=0).apply(
        lambda g: g.groupby(['ref1', 'ref2']).mean()).unstack()['rtn']
    #tmp.columns = tmp.columns.get_level_values(1)
    tmp.index.names = ('trddt', 'ref1')
    HML = tmp.mean(axis=0, level=0)
    SMB = tmp.mean(axis=1).unstack()
    # High-minus-low spreads along each sorting dimension.
    return SMB.iloc[:, -1] - SMB.iloc[:, 0], HML.iloc[:, -1] - HML.iloc[:, 0]
def test_stack_ints(self): columns = MultiIndex.from_tuples(list(itertools.product(range(3), repeat=3))) df = DataFrame(np.random.randn(30, 27), columns=columns) assert_frame_equal(df.stack(level=[1, 2]), df.stack(level=1).stack(level=1)) assert_frame_equal(df.stack(level=[-2, -1]), df.stack(level=1).stack(level=1)) df_named = df.copy() df_named.columns.set_names(range(3), inplace=True) assert_frame_equal(df_named.stack(level=[1, 2]), df_named.stack(level=1).stack(level=1))
def cal_SMB_HML_FF(ret, EndDate, size=None, book=None, weights=None):
    # TODO: the return value should keep the same frequency as EndDate
    """Compute SMB and HML factor returns following the Fama-French (1993)
    timing conventions (June size sort, December book equity).

    NOTE(review): the *size* and *book* arguments are immediately
    overwritten by ``import_data(...)`` below, so they are effectively
    unused — confirm against callers.
    """
    percentile1 = [0.0, 0.5, 1.0]  # size breakpoints
    percentile2 = [0.0, 0.3, 0.7, 1.0]  # value breakpoints
    label_1 = [i + 1 for i in range(len(percentile1) - 1)]
    label_2 = [i + 1 for i in range(len(percentile2) - 1)]
    size, book = import_data(PV_vars=['size_tot'],
                             BS_vars=['tot_shrhldr_eqy_excl_min_int'])[:2]
    # Book equity: last December observation per year, positive values only.
    BE = book.drop(book.index[book.index.duplicated(keep='last')]).unstack()
    BE = BE[BE.index.month == 12]
    BE = BE[BE > 0]
    size = size.unstack()
    ME = size.copy()
    ME = ME.resample('M').last()
    ME6 = ME[ME.index.month == 6]    # June market equity -> size sort
    ME12 = ME[ME.index.month == 12]  # December market equity -> B/M
    # Hard-coded patch: use the first 2005 trading day for 2004-12-31.
    ME12.loc[parse('20041231')] = size.loc['2005-01-04']
    ME12 = ME12.sort_index()
    BM = BE.reindex(index=ME12.index, columns=ME12.columns) / ME12
    mark_1 = DataFrame(
        [qcut(ME6.loc[i], q=percentile1, labels=label_1) for i in ME6.index])
    mark_1.index = mark_1.index + Day()
    mark_1 = mark_1.resample('D').ffill().reindex(index=EndDate)
    mark_2 = DataFrame(
        [qcut(BM.loc[i], q=percentile2, labels=label_2) for i in BM.index])
    # B/M marks become effective seven months later (July of the next year).
    mark_2.index = mark_2.index + MonthBegin(7)
    mark_2 = mark_2.resample('D').ffill().reindex(index=EndDate)
    if weights is None:
        # Equal-weighted cell returns per (date, size, value) bucket.
        df = DataFrame()
        df['ret'] = ret.stack()
        df['ref1'] = mark_1.stack()
        df['ref2'] = mark_2.stack()
        df = df.dropna()
        tmp = df.groupby(level=0).apply(
            lambda g: g.groupby(['ref1', 'ref2']).mean())['ret'].unstack()
    else:
        # Value-weighted: weight by the previous day's market cap.
        weights = size.resample('D').ffill().reindex(index=EndDate).shift(1)
        df = DataFrame()
        df['ret'] = (ret * weights).stack()
        df['ref1'] = mark_1.stack()
        df['ref2'] = mark_2.stack()
        df['w'] = weights.stack()
        df = df.dropna()
        tmp1 = df.groupby(
            level=0).apply(lambda g: g.groupby(['ref1', 'ref2']).sum())['ret']
        tmp2 = df.groupby(
            level=0).apply(lambda g: g.groupby(['ref1', 'ref2']).sum())['w']
        tmp = (tmp1 / tmp2).unstack()
    rHML = tmp.mean(axis=0, level=0)
    rSMB = tmp.mean(axis=1).unstack()
    # Long-short spreads: small-minus-big, high-minus-low.
    return rSMB.iloc[:, -1] - rSMB.iloc[:, 0], rHML.iloc[:, -1] - rHML.iloc[:, 0]
def test_stack_ints(self): df = DataFrame(np.random.randn(30, 27), columns=MultiIndex.from_tuples( list(itertools.product(range(3), repeat=3)))) assert_frame_equal(df.stack(level=[1, 2]), df.stack(level=1).stack(level=1)) assert_frame_equal(df.stack(level=[-2, -1]), df.stack(level=1).stack(level=1)) df_named = df.copy() df_named.columns.set_names(range(3), inplace=True) assert_frame_equal(df_named.stack(level=[1, 2]), df_named.stack(level=1).stack(level=1))
def get_corrs(z_sc_df: pd.DataFrame, merged_df: pd.DataFrame) -> pd.DataFrame:
    """Attach z-score correlations (and derived weights) to each
    (agA_name, agB_name) pair of *merged_df*.

    Pairs absent from the correlation matrix get a z_score of 0.
    """
    logger.info('Getting available hgnc symbols from correlation matrix')
    # NOTE(review): corr_symb_set is never used below — confirm it can go.
    corr_symb_set = set(z_sc_df.columns.values)
    logger.info('Stacking the correlation matrix: may take a couple of '
                'minutes and tens of GiB of memory')
    # Long format: one row per (gene A, gene B) with its z-score.
    stacked_z_sc_df = z_sc_df.stack(dropna=True).to_frame(
        name='z_score', ).reset_index().rename(columns={
            'level_0': 'agA_name',
            'level_1': 'agB_name'
        })
    # Merge in stacked correlations to the sif df
    logger.info('Getting relevant correlations')
    z_corr_pairs = merged_df[['agA_name',
                              'agB_name']].merge(right=stacked_z_sc_df,
                                                 how='left').drop_duplicates()
    # z_score: original z-score or 0 if nonexistant
    z_corr_pairs.loc[z_corr_pairs.z_score.isna(), 'z_score'] = 0
    # Get self correlation (a diagonal entry of the matrix)
    self_corr = z_sc_df.iloc[0, 0]
    assert isinstance(self_corr, (int, float)) and self_corr > 0
    # Calculate corr weight = (self_corr_z_sc - abs(z_score)) / self_corr
    # NOTE(review): the formula above references `self_corr_z_sc`, which does
    # not exist in this scope, and `z_sc_weight_df` is not defined in the
    # visible code — confirm the helper exists and matches the formula.
    z_corr_pairs['corr_weight'] = z_sc_weight_df(z_corr_pairs, self_corr)
    logger.info('Finished setting z-score and z-score weight in sif df')
    return z_corr_pairs
def chi(self, customattribute):
    """Compute the chi-square statistic of *customattribute* against the
    class attribute.

    Observed counts are the numbers of distinct students per
    (attribute value, class value) cell, accumulated across ``self.chunks``;
    expected counts follow the marginal class proportions.

    Returns:
        A tuple ``(chisquare_result, observed)`` where ``chisquare_result``
        is scipy's ``(statistic, pvalue)`` and ``observed`` is the
        attribute-by-class count table.
    """
    attributeDict = dict()
    classAttributeDict = dict()
    for piece in self.chunks:
        # Distinct students per (attribute, class) cell, merged over chunks.
        # BUG FIX: Series.iteritems() was removed in pandas 2.0 — .items()
        # is the supported equivalent.
        for (attribute, classAttribute), arrays in piece.groupby(
                [customattribute,
                 self.classAttribute]).studentID.unique().items():
            attributeDict.setdefault((attribute, classAttribute), np.array([]))
            attributeDict[(attribute, classAttribute)] = np.union1d(
                attributeDict[(attribute, classAttribute)], arrays)
        # Distinct students per class value.
        for classAttribute, arrays in piece.groupby(
                self.classAttribute).studentID.unique().items():
            classAttributeDict.setdefault(classAttribute, np.array([]))
            classAttributeDict[classAttribute] = np.union1d(
                classAttributeDict[classAttribute], arrays)
    # Share of each class value in the overall population.
    classSeries = Series(classAttributeDict).apply(len)
    classSeries /= classSeries.sum()
    # Observed counts: attribute values as rows, class values as columns.
    attributeObs = Series(attributeDict).apply(len).unstack(fill_value=0)
    attributeExp = DataFrame(index=attributeObs.index,
                             columns=attributeObs.columns)
    # Row totals as the starting point for expected counts.
    # BUG FIX: the .ix indexer was removed in pandas 1.0; .loc is the
    # label-based replacement.
    for index in attributeExp.index:
        attributeExp.loc[index] = attributeObs.loc[index].sum()
    # Expected count = row total x marginal class proportion.
    attributeExp = attributeExp.mul(classSeries).fillna(0)
    # Chi-square of observed vs expected, plus the observed table.
    return chisquare(attributeObs.stack(), attributeExp.stack()), attributeObs
def pandas_reshape_pivot_part1():
    """Demo of reshaping with hierarchical indexes (stack/unstack)."""
    # stack: pivot the columns "into" the rows; unstack: rows into columns.
    data = DataFrame(np.arange(6).reshape((2, 3)),
                     index=pd.Index(['Ohio', 'Colorado'], name='state'),
                     columns=pd.Index(['one', 'two', 'three'], name='number'))
    # stack() turns the column index into the innermost row index,
    # converting the DataFrame into a Series.
    result = data.stack()
    # print(result)
    # unstack() turns the innermost row index back into columns,
    # converting the Series back into a DataFrame.
    # print(result.unstack())
    # By default stack()/unstack() operate on the innermost level; a
    # specific level can also be selected by number or name.
    # print(result.unstack(0))        # the outermost level is number 0
    # print(result.unstack('state'))
    s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
    s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
    # data2 = pd.concat([s1, s2], keys=['one', 'two'])
    # print(data2.unstack())  # missing level combinations become NaN
    # BUG FIX: pd.Index takes the keyword ``name`` (singular); ``names`` is
    # rejected by the Index constructor.
    df = DataFrame({
        'left': result,
        'right': result + 5
    }, columns=pd.Index(['left', 'right'], name='side'))
    print(df)
def clean_correlations(df_cor: pd.DataFrame) -> pd.DataFrame:
    """
    Stack a correlation matrix, drop duplicate pair entries and sort the
    remaining values in descending order.

    :param df_cor: (pd.DataFrame) Correlation matrix
    :return: pd.DataFrame as stacked correlation matrix
    """
    # Long format is easier to filter than the square matrix.
    stacked = df_cor.stack().reset_index()
    stacked.columns = ["Var_1", "Var_2", "Correlation"]

    # Tag each unordered pair so (a, b) and (b, a) share an index.
    stacked["index"] = stacked.apply(lambda row: _return_index(row), axis=1)
    pair_means = (stacked.groupby("index").mean().reset_index()
                  .rename(columns={"Correlation": "Cor_mean"}))
    stacked = pd.merge(stacked, pair_means, how="left", on="index")

    # Keep one orientation of each pair; keep both only when the two
    # orientations disagree (asymmetric input).
    keep = ((stacked["Var_1"] < stacked["Var_2"])
            | (np.abs(stacked["Correlation"] - stacked["Cor_mean"]) > 0.000001))
    stacked = stacked[keep]

    # The helper columns were only needed to detect duplicates.
    stacked.drop(["index", "Cor_mean"], axis=1, inplace=True)
    stacked.sort_values(by="Correlation", ascending=False, inplace=True)
    return stacked
def _calc_firing_rate(self, num_peaks: pd.DataFrame, epoch: str = "All_cells"): """ Sum all indices of peaks to find the average firing rate of cells in the three epochs :return: """ # Remove silent cells from comparison split_data = num_peaks.stack() mc = MultiComparison(split_data.values, split_data.index.get_level_values(1).values) try: res = mc.tukeyhsd() except ValueError: aprint("<yellow>Failed during the p-value calculation.</yellow>") else: print(res) print( f"P-values ({epoch}, number of cells: {split_data.shape[0] // 3}):", psturng( np.abs(res.meandiffs / res.std_pairs), len(res.groupsunique), res.df_total, ), ) finally: print(split_data.mean(level=1))
def plot_cond_prob(p_A_given_B: pd.DataFrame):
    """Plot P(A|B): an area chart for numeric B, stacked bars otherwise."""
    # The index holds the conditioning variable B.
    b_name = p_A_given_B.index.name

    if p_A_given_B.index.is_numeric():
        # Numeric B: area plot with B on the x-axis.
        return p_A_given_B.reset_index().plot.area(x=b_name, figsize=(15, 10))

    if p_A_given_B.index.is_object() | p_A_given_B.index.is_categorical():
        # Categorical/string B: recover the A column name from the stacked
        # frame, then draw horizontal stacked bars, one bar per B value.
        long_form = p_A_given_B.stack().rename("prob").reset_index()
        a_name = np.setdiff1d(long_form.columns, [b_name, "prob"])[0]
        return (p_A_given_B.stack().rename("prob").unstack(a_name)
                .plot.barh(figsize=(15, 10), stacked=True))
def player_data_classifier(attr, reverse=0):
    """Find the threshold on the summed per-player *attr* that best predicts
    team1 winning, then plot the win/loss counts on each side of it.

    :param attr: column suffix, summed over the player1_..player5_ columns.
    :param reverse: 0 if larger values should predict a win, else inverted.
    """
    main_data = pd.read_csv('data.csv')
    # Total of the attribute over the five players of team 1.
    subtable_kill = main_data['player1_' + attr] + main_data[
        'player2_' + attr] + main_data['player3_' + attr] + main_data[
            'player4_' + attr] + main_data['player5_' + attr]
    subtable_kill.name = 'player_' + attr
    subtable = DataFrame([main_data['team1_win'], subtable_kill])
    # Transpose via stack/unstack: rows = matches, columns = the two series.
    subtable = subtable.stack().unstack(0)
    subtable.index.name = 'index'
    m = min(subtable_kill)
    M = max(subtable_kill)
    max_accuracy = 0
    max_i = m

    def no(x):
        # Logical NOT for a 0/1 flag.
        if (x == 0):
            return 1
        else:
            return 0

    # Scan every candidate threshold i in [m, M].
    for i in range(m, M + 1):
        if (M - m >= 40):
            # Progress output for long scans.
            if ((i - m) % 100 == 0):
                print('This is the ', i - m, '/', M - m, 'th iteration.')
        cross_entropy = 0  # NOTE(review): assigned but never used
        tmp1 = subtable['player_' + attr] >= i
        tmp2 = subtable['player_' + attr] < i
        u = subtable['team1_win'][tmp1]
        v = subtable['team1_win'][tmp2]
        if (reverse == 0):
            accuracy = (u.sum() + v.apply(no).sum()) / len(subtable.index)
        else:
            accuracy = (v.sum() + u.apply(no).sum()) / len(subtable.index)
        if accuracy > max_accuracy:
            max_accuracy = accuracy
            max_i = i
    print('max_accuracy=', max_accuracy)
    print('corresponding classifier i=', max_i)
    # Win/loss tally at or above the best threshold.
    tmp = subtable[subtable['player_' + attr] >= max_i]['team1_win'].value_counts()
    if (reverse == 0):
        tmp.index = ['team1_win', 'team1_loss']
    else:
        tmp.index = ['team1_loss', 'team1_win']
    plt.title('player_' + attr + '>=' + str(max_i))
    tmp.plot(kind='barh')
    plt.show()
    print('player_' + attr + '>=', max_i, '对应的玩家胜负情况统计')
    print(tmp)
    # Win/loss tally below the best threshold.
    tmp = subtable[subtable['player_' + attr] < max_i]['team1_win'].value_counts()
    if (reverse == 0):
        tmp.index = ['team1_loss', 'team1_win']
    else:
        tmp.index = ['team1_win', 'team1_loss']
    plt.title('player_' + attr + '<' + str(max_i))
    tmp.plot(kind='barh')
    plt.show()
    print('player_' + attr + '<', max_i, '对应的玩家胜负情况统计')
    print(tmp)
def interp_to_obs(var, df, lat, lon, radius=12000.):
    """Nearest-neighbour interpolation of a gridded model field onto
    observation sites, merged back with the observations.

    Parameters
    ----------
    var : array-like with ``transpose('y', 'x', 'time')`` and ``.time``
        Gridded model field — presumably an xarray.DataArray of CMAQ
        output; confirm with callers.
    df : pandas.DataFrame
        Observations; must contain 'Latitude', 'Longitude', 'SCS',
        'datetime', 'datetime_local', 'Obs' and 'utcoffset' columns.
    lat, lon : array-like
        Latitude/longitude arrays describing the model grid.
    radius : float
        Search radius of influence in metres (the default is 12000.).

    Returns
    -------
    pandas.DataFrame
        Long-format frame with one row per (site, time), holding the model
        value, the observation and the local datetime.
    """
    from numpy import NaN, vstack
    from pyresample import geometry, image
    from pandas import to_timedelta, DataFrame
    # define CMAQ pyresample grid (source)
    grid1 = geometry.GridDefinition(lons=lon, lats=lat)
    # get unique sites from df
    dfn = df.drop_duplicates(subset=['Latitude', 'Longitude'])
    # define site grid (target)
    lats = dfn.Latitude.values
    lons = dfn.Longitude.values
    grid2 = geometry.GridDefinition(lons=vstack(lons), lats=vstack(lats))
    # Create image container for nearest-neighbour resampling
    i = image.ImageContainerNearest(var.transpose('y', 'x', 'time').values,
                                    grid1,
                                    radius_of_influence=radius,
                                    fill_value=NaN)
    # resample onto the site locations
    ii = i.resample(grid2).image_data.squeeze()
    # recombine data: rows = sites, columns = model times
    e = DataFrame(ii, index=dfn.SCS, columns=var.time.values)
    w = e.stack().reset_index().rename(columns={
        'level_1': 'datetime',
        0: 'model'
    })
    # attach static site metadata, then the matching observations
    w = w.merge(dfn.drop(['datetime', 'datetime_local', 'Obs'], axis=1),
                on='SCS',
                how='left')
    w = w.merge(df[['datetime', 'SCS', 'Obs']],
                on=['SCS', 'datetime'],
                how='left')
    # calculate local datetime from the site's UTC offset (hours)
    w['datetime_local'] = w.datetime + to_timedelta(w.utcoffset, 'H')
    return w
def _plot_wcorr(Wcorr, L):
    """Render the W-correlation matrix *Wcorr* (L x L) as an interactive
    Bokeh heat map with hover tool-tips."""
    f = figure(tools="box_select, pan, reset, save")
    f.plot_width = 700
    f.plot_height = 600
    # Background settings
    f.background_fill_color = '#859dcd'
    f.background_fill_alpha = 0.05
    # Title settings
    f.title.text = "W-Correlation for L={}".format(L)
    f.title.text_font = 'Helvetica'
    f.title.text_font_size = '24px'
    f.title.align = 'center'
    f.title.text_font_style = "italic"
    # Axis settings (y axis reversed so F_1 is at the top)
    f.xaxis.axis_label = 'Fⱼ'
    f.yaxis.axis_label = 'Fᵢ'
    f.axis.axis_label_text_font = 'Helvetica'
    f.axis.axis_label_text_font_size = '24px'
    f.axis.major_label_orientation = 0
    f.x_range = Range1d(start=0.5, end=L + 0.5)
    f.y_range = Range1d(start=L + 0.5, end=0.5)
    f.axis[0].ticker.desired_num_ticks = L
    f.axis[0].ticker.num_minor_ticks = 0
    # Long-format (F_i, F_j, corr) table for the rect glyphs.
    data = DataFrame(Wcorr)
    axis = [i for i in range(1, Wcorr.shape[0] + 1)]
    data['F_i'] = axis
    data.set_index('F_i', inplace=True)
    data.columns = axis
    data.columns.name = 'F_j'
    df = DataFrame(data.stack(), columns=['corr']).reset_index()
    source = ColumnDataSource(df)
    # this is the colormap from the original NYTimes plot
    mapper = LinearColorMapper(palette=color.palettes['colors_2'],
                               low=0,
                               high=1)
    # One unit square per (F_i, F_j) cell, coloured by correlation.
    f.rect(x="F_i",
           y="F_j",
           width=1,
           height=1,
           source=source,
           line_color=None,
           fill_color=transform('corr', mapper))
    color_bar = ColorBar(
        color_mapper=mapper,
        location=(0, 0),
        ticker=BasicTicker(
            desired_num_ticks=len(color.palettes['colors_2'])),
        formatter=PrintfTickFormatter(format="%.2f"))
    f.add_layout(color_bar, 'right')
    # Hover shows the component pair and its correlation value.
    hover = HoverTool(tooltips=[("Components", "(@F_i, @F_j)"),
                                ("Correlations", "@corr")])
    f.add_tools(hover)
    show(f)
def test_stack_partial_multiIndex(self):
    # GH 8844: stack with partial / reordered MultiIndex column selections.
    # NOTE: relies on the legacy (pre-1.0 pandas) ``Index.get_values`` and
    # ``MultiIndex(labels=...)`` APIs.
    def _test_stack_with_multiindex(multiindex):
        df = DataFrame(np.arange(3 * len(multiindex)).reshape(3, len(multiindex)),
                       columns=multiindex)
        for level in (-1, 0, 1, [0, 1], [1, 0]):
            result = df.stack(level=level, dropna=False)
            if isinstance(level, int):
                # Stacking a single level should not make any all-NaN rows,
                # so df.stack(level=level, dropna=False) should be the same
                # as df.stack(level=level, dropna=True).
                expected = df.stack(level=level, dropna=True)
                if isinstance(expected, Series):
                    assert_series_equal(result, expected)
                else:
                    assert_frame_equal(result, expected)

            # Rebuilding the columns from their tuples must not change
            # the stacked result.
            df.columns = MultiIndex.from_tuples(df.columns.get_values(),
                                                names=df.columns.names)
            expected = df.stack(level=level, dropna=False)
            if isinstance(expected, Series):
                assert_series_equal(result, expected)
            else:
                assert_frame_equal(result, expected)

    full_multiindex = MultiIndex.from_tuples(
        [("B", "x"), ("B", "z"), ("A", "y"), ("C", "x"), ("C", "u")],
        names=["Upper", "Lower"]
    )
    # Exercise many subsets of the columns, in both orders.
    for multiindex_columns in (
        [0, 1, 2, 3, 4],
        [0, 1, 2, 3],
        [0, 1, 2, 4],
        [0, 1, 2],
        [1, 2, 3],
        [2, 3, 4],
        [0, 1],
        [0, 2],
        [0, 3],
        [0],
        [2],
        [4],
    ):
        _test_stack_with_multiindex(full_multiindex[multiindex_columns])
        if len(multiindex_columns) > 1:
            multiindex_columns.reverse()
            _test_stack_with_multiindex(full_multiindex[multiindex_columns])

    # dropna=False must keep the rows that become all-NaN after stacking.
    df = DataFrame(np.arange(6).reshape(2, 3),
                   columns=full_multiindex[[0, 1, 3]])
    result = df.stack(dropna=False)
    expected = DataFrame(
        [[0, 2], [1, nan], [3, 5], [4, nan]],
        index=MultiIndex(
            levels=[[0, 1], ["u", "x", "y", "z"]],
            labels=[[0, 0, 1, 1], [1, 3, 1, 3]],
            names=[None, "Lower"]
        ),
        columns=Index(["B", "C"], name="Upper"),
        dtype=df.dtypes[0],
    )
    assert_frame_equal(result, expected)
def find_closest_from_another(table: pd.DataFrame) -> pd.Series:
    """
    Returns as Series of tuples:
    species | (seqid_of_closest, species_of_closest, seqid_of_self)
    """
    # Work on a copy: the same-species cells are blanked out below.
    masked = table.copy()
    for species in masked.index.levels[1]:
        # Exclude within-species distances from the minimum search.
        masked.loc[(slice(None), species), (slice(None), species)] = np.nan
    stacked = masked.stack(level=0)
    return stacked.idxmin()
def stack_on_colnames(
    dframe: pd.DataFrame,
    sep: str = "@",
    stackcolname: str = "DATE",
    inplace: bool = True,
) -> pd.DataFrame:
    """For a dataframe where some columns are multilevel, but where the
    second level is encoded in the column name, this function will stack
    the dataframe by putting the second level of the column multiindex
    into its own column, best understood by this example:

    A dframe like this

    ===== =============== ==============
    PORV  OWC@2000-01-01  OWC@2020-01-01
    ===== =============== ==============
    100   1000            990
    ===== =============== ==============

    will be stacked to

    ==== ==== ==========
    PORV OWC  DATE
    ==== ==== ==========
    100  1000 2000-01-01
    100  990  2020-01-01
    ==== ==== ==========

    (for the defaults values for *sep* and *stackcolname*)

    Column order is not guaranteed

    Args:
        dframe: A dataframe to stack
        sep: The separator that is used in dframe.columns to define
            the multilevel column names.
        stackcolname: Used as column name for the second level
            of the column multiindex
        inplace: If True (the default) the input frame is mutated.
    """
    if not inplace:
        dframe = dframe.copy()
    # Split "OWC@2000-01-01" -> ("OWC", "2000-01-01"); static columns such
    # as "PORV" become 1-tuples.
    tuplecolumns = list(map(lambda x: tuple(x.split(sep)), dframe.columns))
    if max(map(len, tuplecolumns)) < 2:
        logger.info("No columns to stack")
        return dframe
    dframe.columns = pd.MultiIndex.from_tuples(tuplecolumns,
                                               names=["dummy", stackcolname])
    dframe = dframe.stack()
    # Static (1-tuple) columns have NaN at every real date row; forward-fill
    # them from the row that carried their value.
    # BUG FIX: fillna(method="ffill") is deprecated in pandas >= 2.1;
    # DataFrame.ffill() is the equivalent supported call.
    staticcols = [col[0] for col in tuplecolumns if len(col) == 1]
    dframe[staticcols] = dframe[staticcols].ffill()
    dframe.reset_index(inplace=True)
    # Drop rows stemming from the NaNs in the second tuple-element for
    # static columns.
    # BUG FIX: the subset previously hard-coded "DATE" and would raise a
    # KeyError for any non-default *stackcolname*.
    dframe.dropna(axis="index", subset=[stackcolname], inplace=True)
    del dframe["level_0"]
    dframe.index.name = ""
    return dframe
def get_factor_data(
    factor: pd.DataFrame,
    price_data: pd.DataFrame,
    periods: Optional[List[int]] = None,
    split: Union[int, Sequence[float]] = 3,
    long_short: bool = False,
    leverage: float = 1,
    name: str = "",
) -> pd.DataFrame:
    """Return merged data: factor values, quantiles, weights and returns.

    Args:
        factor: wide frame of factor values (datetime index, asset columns).
        price_data: wide frame with a "close" field per asset in the second
            column level.
        periods: extra forward-return horizons in bars; horizon 1 is always
            included.
        split: number of quantiles (int) or explicit bin edges (sequence).
        long_short: if True, weight only the top and bottom quantiles.
        leverage: scaling applied to the weighted returns.
        name: factor name used for output columns and error messages.

    Raises:
        ValueError: if the time zones differ or *split* has an unsupported
            type.

    NOTE(review): the ``factor.loc[datetime.now(...)]`` assignment mutates
    the caller's frame — confirm that is intended.
    """
    prices = price_data.xs("close", axis=1, level=1).filter(factor.columns)
    if factor.index.tz != prices.index.tz:
        raise ValueError("The time zone of `factor` and `prices` don't match.")
    # Append a "now" sentinel row so the forward-fill extends to the present.
    factor.loc[datetime.now(timezone.utc)] = float("nan")
    factor.replace([float("-inf"), float("inf")], float("nan"), inplace=True)
    factor = factor.resample(prices.index.freq).ffill()[prices.index[0] :]
    periods = [1] if not periods else [1] + sorted(periods)
    # Human-readable horizon labels derived from the bar spacing,
    # e.g. "1D" or "0h30m0s".
    deltas = [period * prices.index.to_series().diff().mode() for period in periods]
    deltas = [
        (
            delta.to_string(index=False).replace(":", "h", 1).replace(":", "m")
            + "s"
        ).replace(" dayss", "D")
        for delta in deltas
    ]
    # Forward return over each horizon: price change ahead divided by price.
    forward_returns = {
        delta: -prices.diff(-period) / prices
        for period, delta in dict(zip(periods, deltas)).items()
    }
    index = factor.index.intersection(prices.index)
    factor_data = pd.concat(forward_returns, axis=1).reindex(index).stack()
    factor_data["factor"] = factor.stack()
    # Per-date quantile assignment (1-based).
    if isinstance(split, int):
        factor_quantile = 1 + factor_data.groupby(level=0)["factor"].transform(
            lambda x: pd.qcut(x, split, labels=False, duplicates="drop")
        )
    elif isinstance(split, (list, tuple, set)):
        factor_quantile = 1 + factor_data.groupby(level=0)["factor"].transform(
            lambda x: pd.cut(x, split, labels=False, duplicates="drop")
        )
        split = len(split) - 1
    else:
        raise ValueError(f"Factor `{name}` split type {type(split)} is not supported.")
    factor_data["factor_quantile"] = factor_quantile
    # Demeaned, absolute-sum-normalised weights within the chosen quantiles.
    quantiles = [1, split] if long_short else list(range(1, split + 1))
    factor_data["weights"] = (
        factor_data[factor_data["factor_quantile"].isin(quantiles)]
        .groupby(level=0)["factor"]
        .transform(lambda x: (x - x.mean()) / (x - x.mean()).abs().sum())
    )
    factor_data["weights"].fillna(0, inplace=True)
    # Weighted, levered factor return per horizon.
    for period in forward_returns:
        factor_data[f"{name}_{period}"] = (
            factor_data["weights"] * factor_data[period] * leverage
        )
    factor_data.rename_axis(index=["date", "asset"], inplace=True)
    factor_data.name = name
    return factor_data
def _yearly_to_monthly_records(df: pd.DataFrame) -> pd.DataFrame:
    """Converts an EIA 923 record of 12 months of data into 12 monthly records.

    Much of the data reported in EIA 923 is monthly, but all 12 months worth of
    data is reported in a single record, with one field for each of the 12
    months.  This function converts these annualized composite records into a
    set of 12 monthly records containing the same information, by parsing the
    field names for months, and adding a month field.  Non - time series data
    is retained in the same format.

    Args:
        df: A pandas DataFrame containing the annual data to be converted into
            monthly records.

    Returns:
        A dataframe containing the same data as was passed in via df, but with
        monthly records as rows instead of as columns.
    """
    month_dict = {
        'january': 1,
        'february': 2,
        'march': 3,
        'april': 4,
        'may': 5,
        'june': 6,
        'july': 7,
        'august': 8,
        'september': 9,
        'october': 10,
        'november': 11,
        'december': 12
    }
    # Split e.g. "net_generation_january" -> ("net_generation", "january").
    multi_idx = df.columns.str.rsplit("_", n=1, expand=True).set_names(
        [None, 'report_month'])
    ends_with_month_filter = multi_idx.get_level_values('report_month').isin(
        set(month_dict.keys()))
    if not ends_with_month_filter.any():
        # Nothing monthly to unpack; return unchanged.
        return df
    index_cols = df.columns[~ends_with_month_filter]
    # performance note: this was good enough for eia923 data size.
    # Using .set_index() is simple but inefficient due to unecessary index creation.
    # Performance may be improved by separating into two dataframes,
    # .stack()ing the monthly data, then joining back together on the original index.
    df = df.set_index(list(index_cols), append=True)
    # convert month names to numbers (january -> 1)
    col_df = multi_idx[ends_with_month_filter].to_frame(index=False)
    col_df.loc[:, 'report_month'] = col_df.loc[:, 'report_month'].map(month_dict)
    month_idx = pd.MultiIndex.from_frame(col_df).set_names(
        [None, 'report_month'])
    # reshape: the month level of the columns becomes a row level
    df.columns = month_idx
    df = df.stack()
    # restore original index and columns - reset index except level 0
    df = df.reset_index(level=list(range(1, df.index.nlevels)))
    return df
def test_stack_mixed_levels(self):
    """Stacking by a mixture of level names and numbers (GH #8584)."""
    columns = MultiIndex.from_tuples(
        [
            ("A", "cat", "long"),
            ("B", "cat", "long"),
            ("A", "dog", "short"),
            ("B", "dog", "short"),
        ],
        names=["exp", "animal", "hair_length"],
    )
    frame = DataFrame(np.random.randn(4, 4), columns=columns)
    by_animal_hair = frame.stack(level=["animal", "hair_length"])
    by_exp_hair = frame.stack(level=["exp", "hair_length"])

    # An int that is both a level name and a valid position must be
    # resolved as a name (GH #8584).
    renamed = frame.copy()
    renamed.columns.names = ["exp", "animal", 1]
    tm.assert_frame_equal(
        renamed.stack(level=["animal", 1]), by_animal_hair, check_names=False
    )
    tm.assert_frame_equal(
        renamed.stack(level=["exp", 1]), by_exp_hair, check_names=False
    )

    # Mixing level names with ints that are not names is ambiguous.
    msg = (
        "level should contain all level names or all level numbers, not "
        "a mixture of the two"
    )
    with pytest.raises(ValueError, match=msg):
        renamed.stack(level=["animal", 0])

    # A level literally named 0 used to raise a strange lexsort-depth error.
    zero_named = frame.copy()
    zero_named.columns.names = ["exp", "animal", 0]
    tm.assert_frame_equal(
        zero_named.stack(level=["animal", 0]), by_animal_hair, check_names=False
    )
def test_stack_datetime_column_multiIndex(self): # GH 8039 t = datetime(2014, 1, 1) df = DataFrame([1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, "A", "B")])) result = df.stack() eidx = MultiIndex.from_product([(0, 1, 2, 3), ("B",)]) ecols = MultiIndex.from_tuples([(t, "A")]) expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols) tm.assert_frame_equal(result, expected)
def test_stack_datetime_column_multiIndex(self): # GH 8039 t = datetime(2014, 1, 1) df = DataFrame([1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, "A", "B")])) result = df.stack() eidx = MultiIndex.from_product([(0, 1, 2, 3), ("B",)]) ecols = MultiIndex.from_tuples([(t, "A")]) expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols) assert_frame_equal(result, expected)
def test_stack_preserve_categorical_dtype_values(self): # GH-23077 cat = pd.Categorical(["a", "a", "b", "c"]) df = DataFrame({"A": cat, "B": cat}) result = df.stack() index = pd.MultiIndex.from_product([[0, 1, 2, 3], ["A", "B"]]) expected = Series(pd.Categorical( ["a", "a", "a", "a", "b", "b", "c", "c"]), index=index) tm.assert_series_equal(result, expected)
def test_stack_partial_multiIndex(self):
    # GH 8844: stacking a frame whose columns are an arbitrary subset of a
    # MultiIndex (so not every level value is present) must still work for
    # every level spec, and dropna=False must match dropna=True whenever a
    # single level is stacked (no all-NaN rows can appear in that case).
    # NOTE(review): uses pre-1.0 pandas API (`Index.get_values`,
    # `MultiIndex(labels=...)`) — runs only on old pandas.
    def _test_stack_with_multiindex(multiindex):
        # 3 rows x len(multiindex) columns of consecutive ints.
        df = DataFrame(np.arange(3 * len(multiindex)).reshape(
            3, len(multiindex)), columns=multiindex)
        for level in (-1, 0, 1, [0, 1], [1, 0]):
            result = df.stack(level=level, dropna=False)
            if isinstance(level, int):
                # Stacking a single level should not make any all-NaN rows,
                # so df.stack(level=level, dropna=False) should be the same
                # as df.stack(level=level, dropna=True).
                expected = df.stack(level=level, dropna=True)
                if isinstance(expected, Series):
                    assert_series_equal(result, expected)
                else:
                    assert_frame_equal(result, expected)

            # Rebuild the columns as a full MultiIndex (not a sliced view)
            # and check stacking gives the same result.
            df.columns = MultiIndex.from_tuples(df.columns.get_values(),
                                                names=df.columns.names)
            expected = df.stack(level=level, dropna=False)
            if isinstance(expected, Series):
                assert_series_equal(result, expected)
            else:
                assert_frame_equal(result, expected)

    full_multiindex = MultiIndex.from_tuples([('B', 'x'), ('B', 'z'),
                                              ('A', 'y'),
                                              ('C', 'x'), ('C', 'u')],
                                             names=['Upper', 'Lower'])
    # Exercise many column subsets, each also in reversed order.
    for multiindex_columns in ([0, 1, 2, 3, 4],
                               [0, 1, 2, 3], [0, 1, 2, 4],
                               [0, 1, 2], [1, 2, 3], [2, 3, 4],
                               [0, 1], [0, 2], [0, 3],
                               [0], [2], [4]):
        _test_stack_with_multiindex(full_multiindex[multiindex_columns])
        if len(multiindex_columns) > 1:
            multiindex_columns.reverse()
            _test_stack_with_multiindex(
                full_multiindex[multiindex_columns])

    # With dropna=False, combinations absent from the columns show up as
    # NaN rows after stacking.
    df = DataFrame(np.arange(6).reshape(2, 3),
                   columns=full_multiindex[[0, 1, 3]])
    result = df.stack(dropna=False)
    expected = DataFrame([[0, 2], [1, nan], [3, 5], [4, nan]],
                         index=MultiIndex(levels=[[0, 1],
                                                  ['u', 'x', 'y', 'z']],
                                          labels=[[0, 0, 1, 1],
                                                  [1, 3, 1, 3]],
                                          names=[None, 'Lower']),
                         columns=Index(['B', 'C'], name='Upper'),
                         dtype=df.dtypes[0])
    assert_frame_equal(result, expected)
def plot_probs(probs: DataFrame, weight):
    """Box-plot posterior probabilities grouped by likelihood and prior.

    :param probs: frame whose columns carry 'likelihood' and 'prior'
        levels (they are stacked into rows for plotting).
    :param weight: value shown as the plot title.
    """
    formatter = AxesFormatter()
    # Long format: one row per (index, likelihood, prior) observation,
    # with the stacked values named 'posterior'.
    long_form = (
        probs
        .stack(level=['likelihood', 'prior'])
        .rename('posterior')
        .reset_index()
    )
    boxplot(data=long_form, x='likelihood', y='posterior', hue='prior')
    formatter.rotate_x_tick_labels(90)
    formatter.set_y_lim(0, 1.05)
    formatter.set_axis_below().grid()
    formatter.set_text(title=str(weight))
    formatter.show()
def R2(y_df: pd.DataFrame, y_hat_df: pd.DataFrame) -> float:
    '''
    Out-of-sample R-squared.

    Args
    ----------
    y_df: pd.DataFrame
        actual test data OOS
    y_hat_df: pd.DataFrame
        predicted values OOS

    Returns
    ----------
    R2: float
        1 - SSR / SST over all cells of the frames
    '''
    # Sum of squared residuals across every (row, column) cell.
    SSR = ((y_df - y_hat_df) ** 2).stack().sum()
    # Total sum of squares around the pooled mean: sample var * (n - 1).
    pooled = y_df.stack()
    SST = pooled.var() * (len(pooled) - 1)
    return 1 - SSR / SST
def correct_pvalues(df: pd.DataFrame) -> pd.DataFrame:
    """
    Performs Bonferroni Correction

    :param df: frame of raw p-values
    :return: frame of the same shape holding Bonferroni-corrected p-values
    """
    # Flatten to a Series so every cell enters one multiple-test family.
    stacked = df.stack()
    # multipletests returns (reject, pvals_corrected, ...); keep the
    # corrected p-values, re-attach the (row, column) labels, and unstack
    # back to the original layout.
    corrected = pd.Series(
        multipletests(stacked, method='bonferroni')[1],
        index=stacked.index,
    )
    return corrected.unstack()
def test_stack_multi_columns_non_unique_index(self, index, columns): # GH-28301 df = DataFrame(index=index, columns=columns).fillna(1) stacked = df.stack() new_index = pd.MultiIndex.from_tuples(stacked.index.to_numpy()) expected = DataFrame(stacked.to_numpy(), index=new_index, columns=stacked.columns) tm.assert_frame_equal(stacked, expected) stacked_codes = np.asarray(stacked.index.codes) expected_codes = np.asarray(new_index.codes) tm.assert_numpy_array_equal(stacked_codes, expected_codes)
def _test_stack_with_multiindex(multiindex):
    """GH#8844 helper: stack a small frame by several level specs.

    Checks that ``dropna=False`` matches ``dropna=True`` whenever a single
    level is stacked (no all-NaN rows can appear then), and that rebuilding
    the columns as a full MultiIndex does not change the result.
    """
    df = DataFrame(np.arange(3 * len(multiindex)).reshape(3, len(multiindex)),
                   columns=multiindex)
    for level in (-1, 0, 1, [0, 1], [1, 0]):
        result = df.stack(level=level, dropna=False)
        if isinstance(level, int):
            # Stacking a single level should not make any all-NaN rows,
            # so df.stack(level=level, dropna=False) should be the same
            # as df.stack(level=level, dropna=True).
            expected = df.stack(level=level, dropna=True)
            if isinstance(expected, Series):
                assert_series_equal(result, expected)
            else:
                assert_frame_equal(result, expected)

        # FIX: Index.get_values() was removed in pandas 1.0; to_numpy()
        # is the supported equivalent (matches the modern copy of this
        # helper elsewhere in the file).
        df.columns = MultiIndex.from_tuples(df.columns.to_numpy(),
                                            names=df.columns.names)
        expected = df.stack(level=level, dropna=False)
        if isinstance(expected, Series):
            assert_series_equal(result, expected)
        else:
            assert_frame_equal(result, expected)
def test_stack_preserve_categorical_dtype(self, ordered, labels): # GH13854 cidx = pd.CategoricalIndex(labels, categories=list("xyz"), ordered=ordered) df = DataFrame([[10, 11, 12]], columns=cidx) result = df.stack() # `MultiIndex.from_product` preserves categorical dtype - # it's tested elsewhere. midx = pd.MultiIndex.from_product([df.index, cidx]) expected = Series([10, 11, 12], index=midx) tm.assert_series_equal(result, expected)
def test_stack_int_level_names(self): columns = MultiIndex.from_tuples( [ ("A", "cat", "long"), ("B", "cat", "long"), ("A", "dog", "short"), ("B", "dog", "short"), ], names=["exp", "animal", "hair_length"], ) df = DataFrame(np.random.randn(4, 4), columns=columns) exp_animal_stacked = df.stack(level=["exp", "animal"]) animal_hair_stacked = df.stack(level=["animal", "hair_length"]) exp_hair_stacked = df.stack(level=["exp", "hair_length"]) df2 = df.copy() df2.columns.names = [0, 1, 2] tm.assert_frame_equal(df2.stack(level=[1, 2]), animal_hair_stacked, check_names=False) tm.assert_frame_equal(df2.stack(level=[0, 1]), exp_animal_stacked, check_names=False) tm.assert_frame_equal(df2.stack(level=[0, 2]), exp_hair_stacked, check_names=False) # Out-of-order int column names df3 = df.copy() df3.columns.names = [2, 0, 1] tm.assert_frame_equal(df3.stack(level=[0, 1]), animal_hair_stacked, check_names=False) tm.assert_frame_equal(df3.stack(level=[2, 0]), exp_animal_stacked, check_names=False) tm.assert_frame_equal(df3.stack(level=[2, 1]), exp_hair_stacked, check_names=False)
def test_stack_preserve_categorical_dtype(self, ordered, labels): # GH13854 cidx = pd.CategoricalIndex(labels, categories=list("xyz"), ordered=ordered) df = DataFrame([[10, 11, 12]], columns=cidx) result = df.stack() # `MutliIndex.from_product` preserves categorical dtype - # it's tested elsewhere. midx = pd.MultiIndex.from_product([df.index, cidx]) expected = Series([10, 11, 12], index=midx) tm.assert_series_equal(result, expected)
def test_stack_int_level_names(self): columns = MultiIndex.from_tuples( [("A", "cat", "long"), ("B", "cat", "long"), ("A", "dog", "short"), ("B", "dog", "short")], names=["exp", "animal", "hair_length"], ) df = DataFrame(randn(4, 4), columns=columns) exp_animal_stacked = df.stack(level=["exp", "animal"]) animal_hair_stacked = df.stack(level=["animal", "hair_length"]) exp_hair_stacked = df.stack(level=["exp", "hair_length"]) df2 = df.copy() df2.columns.names = [0, 1, 2] assert_frame_equal(df2.stack(level=[1, 2]), animal_hair_stacked, check_names=False) assert_frame_equal(df2.stack(level=[0, 1]), exp_animal_stacked, check_names=False) assert_frame_equal(df2.stack(level=[0, 2]), exp_hair_stacked, check_names=False) # Out-of-order int column names df3 = df.copy() df3.columns.names = [2, 0, 1] assert_frame_equal(df3.stack(level=[0, 1]), animal_hair_stacked, check_names=False) assert_frame_equal(df3.stack(level=[2, 0]), exp_animal_stacked, check_names=False) assert_frame_equal(df3.stack(level=[2, 1]), exp_hair_stacked, check_names=False)
def test_stack_mixed_level(self): # GH 18310 levels = [range(3), [3, 'a', 'b'], [1, 2]] # flat columns: df = DataFrame(1, index=levels[0], columns=levels[1]) result = df.stack() expected = Series(1, index=MultiIndex.from_product(levels[:2])) assert_series_equal(result, expected) # MultiIndex columns: df = DataFrame(1, index=levels[0], columns=MultiIndex.from_product(levels[1:])) result = df.stack(1) expected = DataFrame(1, index=MultiIndex.from_product([levels[0], levels[2]]), columns=levels[1]) assert_frame_equal(result, expected) # as above, but used labels in level are actually of homogeneous type result = df[['a', 'b']].stack(1) expected = expected[['a', 'b']] assert_frame_equal(result, expected)
def test_compute_forward_returns(self):
    # 1- and 2-day forward returns for a 3-day, 2-asset price panel.
    days = date_range(start='2015-1-1', end='2015-1-3')
    prices = DataFrame(index=days, columns=['A', 'B'],
                       data=[[1, 1], [1, 2], [2, 1]])
    factor = prices.stack()

    fp = compute_forward_returns(factor, prices, periods=[1, 2])

    ix = MultiIndex.from_product([days, ['A', 'B']],
                                 names=['date', 'asset'])
    expected = DataFrame(index=ix, columns=['1D', '2D'])
    # Horizons running past the final price have no return -> NaN.
    expected['1D'] = [0., 1., 1., -0.5, nan, nan]
    expected['2D'] = [1., 0., nan, nan, nan, nan]

    assert_frame_equal(fp, expected)
def write(self, arctic_lib, version, symbol, item, previous_version):
    """Persist a pandas Panel by flattening it to a DataFrame first.

    Parameters mirror the parent store's ``write``; ``item`` is the Panel
    being written.

    Raises
    ------
    ValueError
        If the panel is empty, any axis is a MultiIndex, or the flattened
        frame's columns have an unsupported dtype.
    """
    # FIX: np.product is a deprecated alias removed in NumPy 2.0; np.prod
    # is the supported, identical spelling.
    if np.prod(item.shape) == 0:
        # Currently not supporting zero size panels as they drop indices when converting to dataframes
        # Plan is to find a better solution in due course.
        raise ValueError('Cannot insert a zero size panel into mongo.')
    # FIX: the original used np.all(<generator>), which treats the
    # generator as a single truthy object, so the MultiIndex guard never
    # fired; the builtin all() actually evaluates it.
    if not all(len(i.names) == 1 for i in item.axes):
        raise ValueError('Cannot insert panels with multiindexes')
    item = item.to_frame()
    if len(set(item.dtypes)) == 1:
        # If all columns have the same dtype, we support non-string column names.
        # We know from above check that columns is not a multiindex.
        item = DataFrame(item.stack())
    elif item.columns.dtype != np.dtype('object'):
        raise ValueError('Cannot support non-object dtypes for columns')
    super(PandasPanelStore, self).write(arctic_lib, version, symbol, item,
                                        previous_version)
def testDf2():
    """Create a small stacked test Series.

    Builds a 2x2 frame and stacks it, yielding a Series whose MultiIndex
    pairs (row, column): ('one','a')->1, ('one','b')->2, ('two','a')->3,
    ('two','b')->4.
    """
    # FIX: removed an unrelated 4-column frame ('int'/'float'/'string'/'nan')
    # that was built here and never used (dead code with no side effects).
    df2 = DataFrame({'a': [1, 3], 'b': [2, 4]}, index=['one', 'two'])
    return df2.stack()
def slide_9():
    """Demonstrate long/wide reshaping (pivot/stack/unstack) on macro data.

    Reads the macrodata CSV, builds a quarterly index, and shows that
    ``pivot`` is equivalent to ``set_index`` + ``unstack``.
    NOTE: Python 2 code (print statements).
    """
    data = pd.read_csv(MACRODATAPATH)
    # Quarterly PeriodIndex assembled from the year/quarter columns.
    periods = pd.PeriodIndex(year=data.year, quarter=data.quarter,
                             name='date')
    data = DataFrame(data.to_records(),
                     columns=pd.Index(['realgdp', 'infl', 'unemp'],
                                      name='item'),
                     index=periods.to_timestamp('D', 'end'))
    # Long format: one row per (date, item) pair, values in column 'value'.
    ldata = data.stack().reset_index().rename(columns={0: 'value'})
    wdata = ldata.pivot('date', 'item', 'value')  # wide format (demo only)
    print ldata[:10]
    pivoted = ldata.pivot('date', 'item', 'value')
    print pivoted.head()
    # With two value columns, pivoting without 'values' yields
    # hierarchical columns, one block per value column.
    ldata['value2'] = np.random.randn(len(ldata))
    print ldata[:10]
    pivoted = ldata.pivot('date', 'item')
    print pivoted[:5]
    print pivoted['value'][:5]
    # pivot is shorthand for set_index followed by unstack.
    unstacked = ldata.set_index(['date', 'item']).unstack('item')
    print unstacked[:7]
def slide_8():
    """Walk through stack/unstack round-trips on a small labelled frame.

    NOTE: Python 2 code (print statements).
    """
    data = DataFrame(np.arange(6).reshape((2, 3)),
                     index=pd.Index(['Ohio', 'Colorado'], name='state'),
                     columns=pd.Index(['one', 'two', 'three'],
                                      name='number'))
    print data
    # stack pivots the columns into an inner row level; unstack reverses
    # it, by position or by level name.
    result = data.stack()
    print '***stack()***'
    print result
    print '***unstack()***'
    print result.unstack()
    print '***unstack(0)***'
    print result.unstack(0)
    print "***unstack('state')***"
    print result.unstack('state')

    # Concatenating misaligned Series introduces NaN on unstack...
    s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
    s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
    data2 = pd.concat([s1, s2], keys=['one', 'two'])
    print '***unstack***'
    print data2.unstack()
    print '***unstack->stack***'
    print data2.unstack().stack()
    # ...which stack drops by default; dropna=False keeps the NaN rows.
    print '***unstack->stack(dropna)***'
    print data2.unstack().stack(dropna=False)

    # Unstacking a frame moves the chosen row level into the columns.
    df = DataFrame({'left': result, 'right': result + 5},
                   columns=pd.Index(['left', 'right'], name='side'))
    print 'df'
    print df
    print "unstack('state')"
    print df.unstack('state')
    print "unstack('state').stack('side')"
    print df.unstack('state').stack('side')
def distance_map(self, within, between, metric='wminkowski', p=2.0):
    r"""Create a distance map from the current electrode configuration.

    This method performs some type checking on its arguments.

    Parameters
    ----------
    within, between : number
        `between` is the distance between shanks and `within` is the
        distance between electrodes on any given shank.

    metric : str or callable, optional
        Metric to use to calculate the distance between electrodes/shanks.
        Defaults to a weighted Minkowski distance

    p : numbers.Real, optional
        The :math:`p` of the norm to use. Defaults to 2.0 for weighted
        Euclidean distance.

    Notes
    -----
    The default `metric` of ``'wminkowski'`` and the default `p` of
    ``2.0`` combine to give a weighted Euclidean distance metric. The
    weighted Minkowski distance between two points
    :math:`\mathbf{x},\mathbf{y}\in\mathbb{R}^{n}`, and a weight vector
    :math:`\mathbf{w}\in\mathbb{R}^{n}` is given by

    .. math::
        \left(\sum_{i=1}^{n}w_i\left|x_i-y_i\right|^{p}\right)^{1/p}

    Raises
    ------
    AssertionError
        * If `within` is not an instance of ``numbers.Real``
        * If `between` is not an instance of ``numbers.Real``
        * If `p` is not an instance of ``numbers.Real``
        * If metric is not an instance of ``basestring`` or a callable

    Returns
    -------
    df : DataFrame
        A dataframe with pairwise distances between electrodes, indexed by
        channel, shank.
    """
    # NOTE: Python 2 code (`basestring`, `xrange`, builtin `reduce`).
    assert isinstance(within, numbers.Real) and within > 0, \
        '"within" must be a positive real number'
    assert isinstance(between, numbers.Real) and between > 0, \
        '"between" must be a positive real number'
    assert isinstance(metric, basestring) or callable(metric), \
        '"metric" must be a callable object or a string'
    assert isinstance(p, numbers.Real) and p > 0, \
        'p must be a real number greater than 0'

    # Delegates to the module-level distance_map helper (same name as this
    # method — the global is not shadowed inside the method body).
    dm = distance_map(self.nshanks, self.shank.nunique(), within, between,
                      metric=metric, p=p)

    # Channel/shank labels of the sorted configuration, as raw arrays.
    s = self.sort()
    cols = s.index, s.shank
    values_getter = operator.attrgetter('values')
    cols = tuple(map(values_getter, cols))
    names = 'channel', 'shank'

    def _label_maker(i, names):
        # Suffix each level name with ' i' or ' j' so row and column
        # MultiIndexes stay distinguishable after stacking.
        new_names = tuple(map(lambda x: x + ' %s' % i, names))
        return MultiIndex.from_arrays(cols, names=new_names)

    index = _label_maker('i', names)
    columns = _label_maker('j', names)
    df = DataFrame(dm, index=index, columns=columns)

    # Stack every column level into the rows, producing a Series keyed by
    # all (i, j) label levels.
    nnames = len(names)
    ninds = len(index)
    nlevels = nnames * ninds
    # Interleave i-levels with j-levels for the final level order.
    zipped = zip(xrange(nnames), xrange(nnames, nlevels))
    reordering = tuple(reduce(operator.add, zipped))

    s = df.stack(0)
    for _ in xrange(nnames - 1):
        s = s.stack(0)
    s.name = r'$d\left(i, j\right)$'
    return s.reorder_levels(reordering)
# combine_first: patch NaN holes in df1 with aligned values from df2.
df1 = DataFrame({'a': [1., np.nan, 5., np.nan],
                 'b': [np.nan, 2., np.nan, 6.],
                 'c': range(2, 18, 4)})
df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.],
                 'b': [np.nan, 3., 4., 6., 8.]})
df1.combine_first(df2)

### Reshaping with hierarchical indexing
# 1. stack pivots columns into an inner row level; unstack reverses it,
#    selecting a level by position or by name.
data = DataFrame(np.arange(6).reshape((2, 3)),
                 index=pd.Index(['Ohio', 'Colorado'], name='state'),
                 columns=pd.Index(['one', 'two', 'three'], name='number'))
data
result = data.stack()
result
result.unstack()
result.unstack(0)
result.unstack('state')
# 2. Round-trip of misaligned concatenated Series: unstack introduces NaN
#    for missing labels, which stack drops again by default.
s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
data2.unstack()
data2.unstack().stack()
############################################################### ### ### ### ### ### PIVOTING ### ### ### ### ### ############################################################### ## stacking and unstacking a data frame df1 = DataFrame(np.arange(8).reshape(2,4), index = pd.Index(['LA','SF'],name='city'), columns= pd.Index(['A','B','C','D'], name = 'letters')) # pd.Index enables naming of the columns or index df_st = df1.stack() # pivots rows into columns df_st.unstack() # unpivots the above operation df_st.unstack('city') # will ensure that 'city' are the columns # to go from long data frame to wide data frame, we can use the pivot function # the pivot function is also useful in the excel pivot kind of way dframe.pivot_table(index=['zone'], columns=['Stories','homebath'], values=['homeprice'], aggfunc='mean') # cross tab frequency of occurences pd.crosstab(dframe.homebath, dframe.homebr, margins = True) ############################################################### ### ### ### ###
class ExcelSheet():
    """Load one Excel-backed ADODB recordset into a DataFrame and mirror
    it into the local SQLite database.

    Parameters
    ----------
    connection : open ADODB connection to the workbook
    sheet : str — sheet name, also used as the SQLite table name
    stack : bool — True for wide month-per-column sheets that must be
        stacked into (part-number, date) rows
    icol : value inserted as a fixed process-step column, or False
    """

    def __init__(self, connection, sheet, sql, stack=False, icol=False):
        self.sheet = sheet
        self.connection = connection
        self.recordset = Dispatch('ADODB.Recordset')
        # 0, 1 -> presumably adOpenForwardOnly / adLockReadOnly — TODO confirm
        self.recordset.Open(sql, self.connection, 0, 1)
        self.stack = stack
        # self.sd = SD
        self.icol = icol
        self.df = None  # populated by data()
        print(' |--connect to {0}'.format(self.sheet))

    def column_dates(self):
        # One date per day of the month after SD, then filler dates
        # (last-but-one day) for any remaining recordset fields.
        # current_month_days
        cmd = monthrange(SD.year, SD.month + 1)
        # current_month_last_day
        cmld = date(SD.year, SD.month + 1, cmd[1])
        return [SD + timedelta(days=i) for i in range(cmd[1])] + \
               [cmld - timedelta(days=1)
                for j in range((len(self.recordset.Fields) - cmd[1] - 3))]

    def column_names(self):
        # Stacked sheets get synthesized headers; others use the
        # recordset's own field names.
        if self.stack:
            return ['产品图号', '产品类别'] + \
                self.column_dates() + ['合计']
        else:
            return [field.Name for field in self.recordset.Fields]

    def data(self):
        """Read the recordset into self.df and normalize it via conditions()."""
        self.df = DataFrame(data=list(self.recordset.GetRows()))
        self.df = self.df.T  # GetRows returns column-major data
        self.df.columns = self.column_names()
        if self.stack:
            self.df = self.df.set_index(['产品图号', '工序'])
        else:
            self.df = self.df.set_index('产品图号')
        if self.sheet == '东海.外协':
            # Special-cased sheet: two 31-day blocks (out/in movements)
            # plus summary columns.
            col_names = ['产品类别', '外协盘存', '本月出库', '本月入库', '本月结存'] + \
                        [SD + timedelta(days=i) for i in range(31)] + ['合计'] + \
                        [SD + timedelta(days=i) for i in range(31)] + ['合计1']
            self.df.columns = col_names
            del self.df['产品类别']
            # Positive stock-take rows go to their own SQLite table.
            dd = self.df.iloc[:, 0:1]
            # dd.columns = ['外协盘存']
            dd = dd[dd['外协盘存'] > 0]
            dd['外协盘存'] = dd['外协盘存'].astype(int)
            dd.to_sql(name='东海.外协盘点', con=CONN, flavor='sqlite',
                      if_exists='replace')
            # Drop the summary columns, keeping only the daily blocks.
            del self.df['合计']
            del self.df['合计1']
            del self.df['外协盘存']
            del self.df['本月出库']
            del self.df['本月入库']
            del self.df['本月结存']
            # First daily block, tagged 'W1TO'.
            df1 = self.df.iloc[:, 0:31]
            df1 = df1.stack()
            df1.index.names = ['产品图号', '日期']
            df1 = df1.to_frame()
            df1.columns = ['数量']
            df1.insert(loc=1, column='工序', value='W1TO')
            # Second block, tagged 'WOT1'.
            # NOTE(review): also sliced iloc[:, 0:31] — looks like it should
            # be the second 31-column block; confirm against the workbook.
            df2 = self.df.iloc[:, 0:31]
            df2 = df2.stack()
            df2.index.names = ['产品图号', '日期']
            df2 = df2.to_frame()
            df2.columns = ['数量']
            df2.insert(loc=1, column='工序', value='WOT1')
            self.df = DataFrame(concat([df1, df2]))
            self.df['数量'] = self.df['数量'].astype(int)
            self.df = self.df[self.df['数量'] != 0]
        # NOTE(review): placement of the two statements below relative to
        # the `if` above is reconstructed — confirm against the original.
        self.df = self.conditions()
        return self.df

    def conditions(self):
        """Apply per-sheet normalization rules to self.df."""
        if self.stack:
            # Drop descriptive columns, then melt days into rows.
            for cn in self.df.columns:
                if cn in ['直径', '长度', '成品长度', '产品类别', '合计']:
                    del self.df[cn]
            self.df = self.df.stack()
            self.df.index.names = ['产品图号', '日期']
            self.df = self.df.to_frame()
            self.df.columns = ['数量']
            self.df['数量'] = self.df['数量'].astype(int)
            self.df = self.df[self.df['数量'] != 0]
            if self.icol:
                self.df.insert(loc=1, column='工序', value=self.icol)
        # Coerce inventory columns (wide sheets) to int.
        for cn in self.df.columns:
            if cn in ['W3盘存', 'W2盘存', 'W4盘存']:
                self.df[cn] = self.df[cn].astype(int)
        if self.sheet in ['一部.工序监控', 'G加.工序数据']:
            self.df['日期'] = self.df['日期'].apply(convert_data)
            self.df['日期'] = self.df['日期'].dt.date
            self.df['数量'] = self.df['数量'].astype(int)
        if self.sheet == 'G加.工序数据':
            # Normalize process-step labels to their W4-prefixed form and
            # keep only W4 rows.
            self.df['工序'] = self.df['工序'].replace(
                ['粗磨', '淬火', '回火', '半中磨', '精车', '镀前磨削', '电镀', '镀后', 'GP12检验'],
                ['W4粗磨', 'W4淬火', 'W4回火', 'W4中磨', 'W4精车', 'W4镀前', 'W4电镀', 'W4镀后', 'W4GP12'])
            self.df = self.df[self.df['工序'].str.startswith('W4')]
        return self.df

    def to_db(self):
        # Mirror the normalized frame into SQLite, replacing any old copy.
        self.df.to_sql(name=self.sheet, con=CONN, flavor='sqlite',
                       if_exists='replace')

    def close(self):
        # Best-effort COM cleanup; the recordset may already be gone.
        try:
            self.recordset.Close()
            del self.recordset
        except:
            pass

    def __del__(self):
        # self.close()
        pass
class TestMultiLevel(unittest.TestCase): def setUp(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) self.frame = DataFrame(np.random.randn(10, 3), index=index, columns=Index(['A', 'B', 'C'], name='exp')) self.single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], labels=[[0, 1, 2, 3]], names=['first']) # create test series object arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] tuples = zip(*arrays) index = MultiIndex.from_tuples(tuples) s = Series(randn(8), index=index) s[3] = np.NaN self.series = s tm.N = 100 self.tdf = tm.makeTimeDataFrame() self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() # use Int64Index, to make sure things work self.ymd.index.levels = [lev.astype('i8') for lev in self.ymd.index.levels] self.ymd.index.names = ['year', 'month', 'day'] def test_append(self): a, b = self.frame[:5], self.frame[5:] result = a.append(b) tm.assert_frame_equal(result, self.frame) result = a['A'].append(b['A']) tm.assert_series_equal(result, self.frame['A']) def test_reindex_level(self): # axis=0 month_sums = self.ymd.sum(level='month') result = month_sums.reindex(self.ymd.index, level=1) expected = self.ymd.groupby(level='month').transform(np.sum) assert_frame_equal(result, expected) # Series result = month_sums['A'].reindex(self.ymd.index, level=1) expected = self.ymd['A'].groupby(level='month').transform(np.sum) assert_series_equal(result, expected) # axis=1 month_sums = self.ymd.T.sum(axis=1, level='month') result = month_sums.reindex(columns=self.ymd.index, level=1) expected = self.ymd.groupby(level='month').transform(np.sum).T assert_frame_equal(result, expected) def test_binops_level(self): def _check_op(opname): op = getattr(DataFrame, opname) month_sums = self.ymd.sum(level='month') result = 
op(self.ymd, month_sums, level='month') broadcasted = self.ymd.groupby(level='month').transform(np.sum) expected = op(self.ymd, broadcasted) assert_frame_equal(result, expected) # Series op = getattr(Series, opname) result = op(self.ymd['A'], month_sums['A'], level='month') broadcasted = self.ymd['A'].groupby(level='month').transform(np.sum) expected = op(self.ymd['A'], broadcasted) assert_series_equal(result, expected) _check_op('sub') _check_op('add') _check_op('mul') _check_op('div') def test_pickle(self): import cPickle def _test_roundtrip(frame): pickled = cPickle.dumps(frame) unpickled = cPickle.loads(pickled) assert_frame_equal(frame, unpickled) _test_roundtrip(self.frame) _test_roundtrip(self.frame.T) _test_roundtrip(self.ymd) _test_roundtrip(self.ymd.T) def test_reindex(self): reindexed = self.frame.ix[[('foo', 'one'), ('bar', 'one')]] expected = self.frame.ix[[0, 3]] assert_frame_equal(reindexed, expected) def test_reindex_preserve_levels(self): new_index = self.ymd.index[::10] chunk = self.ymd.reindex(new_index) self.assert_(chunk.index is new_index) chunk = self.ymd.ix[new_index] self.assert_(chunk.index is new_index) ymdT = self.ymd.T chunk = ymdT.reindex(columns=new_index) self.assert_(chunk.columns is new_index) chunk = ymdT.ix[:, new_index] self.assert_(chunk.columns is new_index) def test_sort_index_preserve_levels(self): result = self.frame.sort_index() self.assertEquals(result.index.names, self.frame.index.names) def test_repr_to_string(self): repr(self.frame) repr(self.ymd) repr(self.frame.T) repr(self.ymd.T) buf = StringIO() self.frame.to_string(buf=buf) self.ymd.to_string(buf=buf) self.frame.T.to_string(buf=buf) self.ymd.T.to_string(buf=buf) def test_getitem_simple(self): df = self.frame.T col = df['foo', 'one'] assert_almost_equal(col.values, df.values[:, 0]) self.assertRaises(KeyError, df.__getitem__, ('foo', 'four')) self.assertRaises(KeyError, df.__getitem__, 'foobar') def test_series_getitem(self): s = self.ymd['A'] result = s[2000, 3] 
result2 = s.ix[2000, 3] expected = s[42:65] expected.index = expected.index.droplevel(0).droplevel(0) assert_series_equal(result, expected) result = s[2000, 3, 10] expected = s[49] self.assertEquals(result, expected) # fancy result = s.ix[[(2000, 3, 10), (2000, 3, 13)]] expected = s[49:51] assert_series_equal(result, expected) # key error self.assertRaises(KeyError, s.__getitem__, (2000, 3, 4)) def test_series_setitem(self): s = self.ymd['A'] s[2000, 3] = np.nan self.assert_(isnull(s[42:65]).all()) self.assert_(notnull(s[:42]).all()) self.assert_(notnull(s[65:]).all()) s[2000, 3, 10] = np.nan self.assert_(isnull(s[49])) def test_series_slice_partial(self): pass def test_xs(self): xs = self.frame.xs(('bar', 'two')) xs2 = self.frame.ix[('bar', 'two')] assert_series_equal(xs, xs2) assert_almost_equal(xs.values, self.frame.values[4]) def test_xs_partial(self): result = self.frame.xs('foo') result2 = self.frame.ix['foo'] expected = self.frame.T['foo'].T assert_frame_equal(result, expected) assert_frame_equal(result, result2) def test_fancy_2d(self): result = self.frame.ix['foo', 'B'] expected = self.frame.xs('foo')['B'] assert_series_equal(result, expected) ft = self.frame.T result = ft.ix['B', 'foo'] expected = ft.xs('B')['foo'] assert_series_equal(result, expected) def test_get_loc_single_level(self): s = Series(np.random.randn(len(self.single_level)), index=self.single_level) for k in self.single_level.values: s[k] def test_getitem_toplevel(self): df = self.frame.T result = df['foo'] expected = df.reindex(columns=df.columns[:3]) expected.columns = expected.columns.droplevel(0) assert_frame_equal(result, expected) result = df['bar'] result2 = df.ix[:, 'bar'] expected = df.reindex(columns=df.columns[3:5]) expected.columns = expected.columns.droplevel(0) assert_frame_equal(result, expected) assert_frame_equal(result, result2) def test_getitem_slice_integers(self): index = MultiIndex(levels=[[0, 1, 2], [0, 2]], labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) frame = 
DataFrame(np.random.randn(len(index), 4), index=index, columns=['a', 'b', 'c', 'd']) res = frame.ix[1:2] exp = frame[2:] assert_frame_equal(res, exp) series = Series(np.random.randn(len(index)), index=index) res = series.ix[1:2] exp = series[2:] assert_series_equal(res, exp) def test_getitem_int(self): levels = [[0, 1], [0, 1, 2]] labels = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] index = MultiIndex(levels=levels, labels=labels) frame = DataFrame(np.random.randn(6, 2), index=index) result = frame.ix[1] expected = frame[-3:] expected.index = expected.index.droplevel(0) assert_frame_equal(result, expected) # raises exception self.assertRaises(KeyError, frame.ix.__getitem__, 3) # however this will work result = self.frame.ix[2] expected = self.frame.xs(self.frame.index[2]) assert_series_equal(result, expected) def test_getitem_partial(self): ymd = self.ymd.T result = ymd[2000, 2] expected = ymd.reindex(columns=ymd.columns[ymd.columns.labels[1] == 1]) expected.columns = expected.columns.droplevel(0).droplevel(0) assert_frame_equal(result, expected) def test_getitem_slice_not_sorted(self): df = self.frame.sortlevel(1).T # buglet with int typechecking result = df.ix[:, :np.int32(3)] expected = df.reindex(columns=df.columns[:3]) assert_frame_equal(result, expected) def test_setitem_change_dtype(self): dft = self.frame.T s = dft['foo', 'two'] dft['foo', 'two'] = s > s.median() assert_series_equal(dft['foo', 'two'], s > s.median()) self.assert_(isinstance(dft._data.blocks[1].items, MultiIndex)) reindexed = dft.reindex(columns=[('foo', 'two')]) assert_series_equal(reindexed['foo', 'two'], s > s.median()) def test_frame_setitem_ix(self): self.frame.ix[('bar', 'two'), 'B'] = 5 self.assertEquals(self.frame.ix[('bar', 'two'), 'B'], 5) # with integer labels df = self.frame.copy() df.columns = range(3) df.ix[('bar', 'two'), 1] = 7 self.assertEquals(df.ix[('bar', 'two'), 1], 7) def test_fancy_slice_partial(self): result = self.frame.ix['bar':'baz'] expected = self.frame[3:7] 
assert_frame_equal(result, expected) result = self.ymd.ix[(2000,2):(2000,4)] lev = self.ymd.index.labels[1] expected = self.ymd[(lev >= 1) & (lev <= 3)] assert_frame_equal(result, expected) def test_sortlevel(self): df = self.frame.copy() df.index = np.arange(len(df)) self.assertRaises(Exception, df.sortlevel, 0) # axis=1 # series a_sorted = self.frame['A'].sortlevel(0) self.assertRaises(Exception, self.frame.reset_index()['A'].sortlevel) # preserve names self.assertEquals(a_sorted.index.names, self.frame.index.names) def test_delevel_infer_dtype(self): tuples = [tuple for tuple in cart_product(['foo', 'bar'], [10, 20], [1.0, 1.1])] index = MultiIndex.from_tuples(tuples, names=['prm0', 'prm1', 'prm2']) df = DataFrame(np.random.randn(8,3), columns=['A', 'B', 'C'], index=index) deleveled = df.reset_index() self.assert_(com.is_integer_dtype(deleveled['prm1'])) self.assert_(com.is_float_dtype(deleveled['prm2'])) def test_sortlevel_by_name(self): self.frame.index.names = ['first', 'second'] result = self.frame.sortlevel(level='second') expected = self.frame.sortlevel(level=1) assert_frame_equal(result, expected) def test_sortlevel_mixed(self): sorted_before = self.frame.sortlevel(1) df = self.frame.copy() df['foo'] = 'bar' sorted_after = df.sortlevel(1) assert_frame_equal(sorted_before, sorted_after.drop(['foo'], axis=1)) dft = self.frame.T sorted_before = dft.sortlevel(1, axis=1) dft['foo', 'three'] = 'bar' sorted_after = dft.sortlevel(1, axis=1) assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1), sorted_after.drop([('foo', 'three')], axis=1)) def test_count_level(self): def _check_counts(frame, axis=0): index = frame._get_axis(axis) for i in range(index.nlevels): result = frame.count(axis=axis, level=i) expected = frame.groupby(axis=axis, level=i).count(axis=axis) expected = expected.reindex_like(result).astype('i8') assert_frame_equal(result, expected) self.frame.ix[1, [1, 2]] = np.nan self.frame.ix[7, [0, 1]] = np.nan self.ymd.ix[1, [1, 2]] = np.nan 
# --- continuation of TestMultiLevel (legacy pandas test suite, Python 2 era) ---
# Uses long-deprecated APIs: .ix, sortlevel, self.assert_/assertEquals, Panel.
# NOTE(review): the first statements below continue test_count_level, whose
# `def` line lies before this chunk.
self.ymd.ix[7, [0, 1]] = np.nan
_check_counts(self.frame)
_check_counts(self.ymd)
_check_counts(self.frame.T, axis=1)
_check_counts(self.ymd.T, axis=1)
# can't call with level on regular DataFrame
df = tm.makeTimeDataFrame()
self.assertRaises(Exception, df.count, level=0)
self.frame['D'] = 'foo'
result = self.frame.count(level=0, numeric_only=True)
assert_almost_equal(result.columns, ['A', 'B', 'C'])

def test_count_level_series(self):
    """Series.count(level=...) must equal an explicit groupby(level).count()."""
    index = MultiIndex(levels=[['foo', 'bar', 'baz'],
                               ['one', 'two', 'three', 'four']],
                       labels=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]])
    s = Series(np.random.randn(len(index)), index=index)
    result = s.count(level=0)
    expected = s.groupby(level=0).count()
    # count() may return a denser index than groupby; align and zero-fill
    assert_series_equal(result.astype('f8'),
                        expected.reindex(result.index).fillna(0))
    result = s.count(level=1)
    expected = s.groupby(level=1).count()
    assert_series_equal(result.astype('f8'),
                        expected.reindex(result.index).fillna(0))

def test_count_level_corner(self):
    """count(level=...) on empty Series/DataFrame yields all-zero results."""
    s = self.frame['A'][:0]
    result = s.count(level=0)
    expected = Series(0, index=s.index.levels[0])
    assert_series_equal(result, expected)
    df = self.frame[:0]
    result = df.count(level=0)
    expected = DataFrame({}, index=s.index.levels[0],
                         columns=df.columns).fillna(0).astype(int)
    assert_frame_equal(result, expected)

def test_unstack(self):
    """Smoke test: unstack runs on float and int frames."""
    # just check that it works for now
    unstacked = self.ymd.unstack()
    unstacked2 = unstacked.unstack()
    # test that ints work
    unstacked = self.ymd.astype(int).unstack()

def test_stack(self):
    """stack/unstack round-trips the (year, month, day) frame in many layouts."""
    # regular roundtrip
    unstacked = self.ymd.unstack()
    restacked = unstacked.stack()
    assert_frame_equal(restacked, self.ymd)
    unlexsorted = self.ymd.sortlevel(2)
    unstacked = unlexsorted.unstack(2)
    restacked = unstacked.stack()
    assert_frame_equal(restacked.sortlevel(0), self.ymd)
    unlexsorted = unlexsorted[::-1]
    unstacked = unlexsorted.unstack(1)
    restacked = unstacked.stack().swaplevel(1, 2)
    assert_frame_equal(restacked.sortlevel(0), self.ymd)
    unlexsorted = unlexsorted.swaplevel(0, 1)
    unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1)
    restacked = unstacked.stack(0).swaplevel(1, 2)
    assert_frame_equal(restacked.sortlevel(0), self.ymd)
    # columns unsorted
    unstacked = self.ymd.unstack()
    unstacked = unstacked.sort(axis=1, ascending=False)
    restacked = unstacked.stack()
    assert_frame_equal(restacked, self.ymd)
    # more than 2 levels in the columns
    unstacked = self.ymd.unstack(1).unstack(1)
    result = unstacked.stack(1)
    expected = self.ymd.unstack()
    assert_frame_equal(result, expected)
    result = unstacked.stack(2)
    expected = self.ymd.unstack(1)
    assert_frame_equal(result, expected)
    result = unstacked.stack(0)
    expected = self.ymd.stack().unstack(1).unstack(1)
    assert_frame_equal(result, expected)
    # not all levels present in each echelon
    unstacked = self.ymd.unstack(2).ix[:, ::3]
    stacked = unstacked.stack().stack()
    ymd_stacked = self.ymd.stack()
    assert_series_equal(stacked, ymd_stacked.reindex(stacked.index))
    # stack with negative number
    result = self.ymd.unstack(0).stack(-2)
    expected = self.ymd.unstack(0).stack(0)
    # NOTE(review): no assertion comparing result/expected here — presumably
    # only exercising that stack(-2) runs; confirm before "fixing".

def test_stack_mixed_dtype(self):
    """Stacking a mixed-dtype frame keeps per-column dtypes intact."""
    df = self.frame.T
    df['foo', 'four'] = 'foo'
    df = df.sortlevel(1, axis=1)
    stacked = df.stack()
    assert_series_equal(stacked['foo'], df['foo'].stack())
    self.assert_(stacked['bar'].dtype == np.float_)

def test_unstack_bug(self):
    """Round-trip a groupby-apply result through unstack/stack."""
    df = DataFrame({'state': ['naive','naive','naive',
                              'activ','activ','activ'],
                    'exp': ['a','b','b','b','a','a'],
                    'barcode': [1,2,3,4,1,3],
                    'v': ['hi','hi','bye','bye','bye','peace'],
                    'extra': np.arange(6.)})
    result = df.groupby(['state','exp','barcode','v']).apply(len)
    unstacked = result.unstack()
    restacked = unstacked.stack()
    assert_series_equal(restacked,
                        result.reindex(restacked.index).astype(float))

def test_stack_unstack_preserve_names(self):
    """Index/column level names survive stack and unstack."""
    unstacked = self.frame.unstack()
    self.assertEquals(unstacked.index.name, 'first')
    self.assertEquals(unstacked.columns.names, ['exp', 'second'])
    restacked = unstacked.stack()
    self.assertEquals(restacked.index.names, self.frame.index.names)

def test_unstack_level_name(self):
    """unstack accepts a level name as well as a level number."""
    result = self.frame.unstack('second')
    expected = self.frame.unstack(level=1)
    assert_frame_equal(result, expected)

def test_stack_level_name(self):
    """stack accepts a level name as well as a level number."""
    unstacked = self.frame.unstack('second')
    result = unstacked.stack('exp')
    expected = self.frame.unstack().stack(0)
    assert_frame_equal(result, expected)
    result = self.frame.stack('exp')
    expected = self.frame.stack()
    assert_series_equal(result, expected)

def test_stack_unstack_multiple(self):
    """unstack/stack with a list of levels behaves like chained single-level calls."""
    unstacked = self.ymd.unstack(['year', 'month'])
    expected = self.ymd.unstack('year').unstack('month')
    assert_frame_equal(unstacked, expected)
    self.assertEquals(unstacked.columns.names, expected.columns.names)
    # series
    s = self.ymd['A']
    s_unstacked = s.unstack(['year', 'month'])
    assert_frame_equal(s_unstacked, expected['A'])
    restacked = unstacked.stack(['year', 'month'])
    restacked = restacked.swaplevel(0, 1).swaplevel(1, 2)
    restacked = restacked.sortlevel(0)
    assert_frame_equal(restacked, self.ymd)
    self.assertEquals(restacked.index.names, self.ymd.index.names)
    # GH #451
    unstacked = self.ymd.unstack([1, 2])
    expected = self.ymd.unstack(1).unstack(1)
    assert_frame_equal(unstacked, expected)
    unstacked = self.ymd.unstack([2, 1])
    expected = self.ymd.unstack(2).unstack(1)
    assert_frame_equal(unstacked, expected)

def test_groupby_transform(self):
    """groupby.apply and groupby.transform agree for a pure elementwise func."""
    s = self.frame['A']
    grouper = s.index.get_level_values(0)
    grouped = s.groupby(grouper)
    applied = grouped.apply(lambda x: x * 2)
    expected = grouped.transform(lambda x: x * 2)
    assert_series_equal(applied.reindex(expected.index), expected)

def test_join(self):
    """Outer join of two column slices reconstructs the frame (modulo NaNs)."""
    a = self.frame.ix[:5, ['A']]
    b = self.frame.ix[2:, ['B', 'C']]
    joined = a.join(b, how='outer').reindex(self.frame.index)
    expected = self.frame.copy()
    expected.values[np.isnan(joined.values)] = np.nan
    # the join must leave at least some real values
    self.assert_(not np.isnan(joined.values).all())
    assert_frame_equal(joined, expected)

def test_swaplevel(self):
    """swaplevel by number and by name are equivalent and invertible."""
    swapped = self.frame['A'].swaplevel(0, 1)
    swapped2 = self.frame['A'].swaplevel('first', 'second')
    self.assert_(not swapped.index.equals(self.frame.index))
    assert_series_equal(swapped, swapped2)
    back = swapped.swaplevel(0, 1)
    back2 = swapped.swaplevel('second', 'first')
    self.assert_(back.index.equals(self.frame.index))
    assert_series_equal(back, back2)
    ft = self.frame.T
    swapped = ft.swaplevel('first', 'second', axis=1)
    exp = self.frame.swaplevel('first', 'second').T
    assert_frame_equal(swapped, exp)

def test_swaplevel_panel(self):
    """swaplevel on a Panel's major axis (legacy 3-D structure)."""
    panel = Panel({'ItemA': self.frame, 'ItemB': self.frame * 2})
    result = panel.swaplevel(0, 1, axis='major')
    expected = panel.copy()
    expected.major_axis = expected.major_axis.swaplevel(0, 1)
    tm.assert_panel_equal(result, expected)

def test_reorder_levels(self):
    """reorder_levels is equivalent to the corresponding swaplevel chain."""
    result = self.ymd.reorder_levels(['month', 'day', 'year'])
    expected = self.ymd.swaplevel(0, 1).swaplevel(1, 2)
    assert_frame_equal(result, expected)
    result = self.ymd['A'].reorder_levels(['month', 'day', 'year'])
    expected = self.ymd['A'].swaplevel(0, 1).swaplevel(1, 2)
    assert_series_equal(result, expected)
    result = self.ymd.T.reorder_levels(['month', 'day', 'year'], axis=1)
    expected = self.ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1)
    assert_frame_equal(result, expected)
    # out-of-range level numbers must raise
    self.assertRaises(Exception, self.ymd.index.reorder_levels, [1, 2, 3])

def test_insert_index(self):
    """Assigning a new tuple column keeps a MultiIndex on the columns."""
    df = self.ymd[:5].T
    df[2000, 1, 10] = df[2000, 1, 7]
    self.assert_(isinstance(df.columns, MultiIndex))
    self.assert_((df[2000, 1, 10] == df[2000, 1, 7]).all())

def test_alignment(self):
    """Arithmetic aligns on the union of two MultiIndexes."""
    x = Series(data=[1,2,3],
               index=MultiIndex.from_tuples([("A", 1), ("A", 2), ("B",3)]))
    y = Series(data=[4,5,6],
               index=MultiIndex.from_tuples([("Z", 1), ("Z", 2), ("B",3)]))
    res = x - y
    exp_index = x.index.union(y.index)
    exp = x.reindex(exp_index) - y.reindex(exp_index)
    assert_series_equal(res, exp)
    # hit non-monotonic code path
    res = x[::-1] - y[::-1]
    exp_index = x.index.union(y.index)
    exp = x.reindex(exp_index) - y.reindex(exp_index)
    assert_series_equal(res, exp)

def test_is_lexsorted(self):
    """is_lexsorted detects label order; lexsort_depth is 0 when unsorted."""
    levels = [[0, 1], [0, 1, 2]]
    index = MultiIndex(levels=levels,
                       labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
    self.assert_(index.is_lexsorted())
    index = MultiIndex(levels=levels,
                       labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]])
    self.assert_(not index.is_lexsorted())
    index = MultiIndex(levels=levels,
                       labels=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]])
    self.assert_(not index.is_lexsorted())
    self.assert_(index.lexsort_depth == 0)

def test_frame_getitem_view(self):
    """Writing through df[col].values mutates the parent only when single-dtype."""
    df = self.frame.T
    df['foo'].values[:] = 0
    self.assert_((df['foo'].values == 0).all())
    # but not if it's mixed-type
    df['foo', 'four'] = 'foo'
    df = df.sortlevel(0, axis=1)
    df['foo']['one'] = 2
    self.assert_((df['foo', 'one'] == 0).all())

def test_frame_getitem_not_sorted(self):
    """Top-level selection works on a frame whose columns are not lexsorted."""
    df = self.frame.T
    df['foo', 'four'] = 'foo'
    arrays = [np.array(x) for x in zip(*df.columns.get_tuple_index())]
    result = df['foo']
    result2 = df.ix[:, 'foo']
    expected = df.reindex(columns=df.columns[arrays[0] == 'foo'])
    expected.columns = expected.columns.droplevel(0)
    assert_frame_equal(result, expected)
    assert_frame_equal(result2, expected)
    df = df.T
    result = df.xs('foo')
    result2 = df.ix['foo']
    expected = df.reindex(df.index[arrays[0] == 'foo'])
    expected.index = expected.index.droplevel(0)
    assert_frame_equal(result, expected)
    assert_frame_equal(result2, expected)

def test_series_getitem_not_sorted(self):
    """Top-level selection works on a Series whose index is not lexsorted."""
    arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'],
              ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
    tuples = zip(*arrays)
    index = MultiIndex.from_tuples(tuples)
    s = Series(randn(8), index=index)
    arrays = [np.array(x) for x in zip(*index.get_tuple_index())]
    result = s['qux']
    result2 = s.ix['qux']
    expected = s[arrays[0] == 'qux']
    expected.index = expected.index.droplevel(0)
    assert_series_equal(result, expected)
    assert_series_equal(result2, expected)

# reduction method names exercised by the grouped-aggregation tests below
AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew',
                 'mad', 'std', 'var']

def test_series_group_min_max(self):
    """Series reductions with level= match the equivalent groupby.agg."""
    for op, level, skipna in cart_product(self.AGG_FUNCTIONS,
                                          range(2),
                                          [False, True]):
        grouped = self.series.groupby(level=level)
        aggf = lambda x: getattr(x, op)(skipna=skipna)
        # skipna=True
        leftside = grouped.agg(aggf)
        rightside = getattr(self.series, op)(level=level, skipna=skipna)
        assert_series_equal(leftside, rightside)

def test_frame_group_ops(self):
    """DataFrame reductions with level=/axis= match groupby.agg on both axes."""
    self.frame.ix[1, [1, 2]] = np.nan
    self.frame.ix[7, [0, 1]] = np.nan
    for op, level, axis, skipna in cart_product(self.AGG_FUNCTIONS,
                                                range(2), range(2),
                                                [False, True]):
        if axis == 0:
            frame = self.frame
        else:
            frame = self.frame.T
        grouped = frame.groupby(level=level, axis=axis)
        aggf = lambda x: getattr(x, op)(skipna=skipna, axis=axis)
        leftside = grouped.agg(aggf)
        rightside = getattr(frame, op)(level=level, axis=axis,
                                       skipna=skipna)
        # for good measure, groupby detail
        level_index = frame._get_axis(axis).levels[level]
        self.assert_(leftside._get_axis(axis).equals(level_index))
        self.assert_(rightside._get_axis(axis).equals(level_index))
        assert_frame_equal(leftside, rightside)

def test_frame_series_agg_multiple_levels(self):
    """sum(level=[...]) with multiple levels matches groupby on those levels."""
    result = self.ymd.sum(level=['year', 'month'])
    expected = self.ymd.groupby(level=['year', 'month']).sum()
    assert_frame_equal(result, expected)
    result = self.ymd['A'].sum(level=['year', 'month'])
    expected = self.ymd['A'].groupby(level=['year', 'month']).sum()
    assert_series_equal(result, expected)

def test_groupby_multilevel(self):
    """groupby(level=[0, 1]) equals grouping by the extracted level values."""
    result = self.ymd.groupby(level=[0, 1]).mean()
    k1 = self.ymd.index.get_level_values(0)
    k2 = self.ymd.index.get_level_values(1)
    expected = self.ymd.groupby([k1, k2]).mean()
    assert_frame_equal(result, expected)
    self.assertEquals(result.index.names, self.ymd.index.names[:2])
    result2 = self.ymd.groupby(level=self.ymd.index.names[:2]).mean()
    assert_frame_equal(result, result2)

def test_groupby_multilevel_with_transform(self):
    # placeholder — intentionally empty
    pass

def test_multilevel_consolidate(self):
    """Smoke test: consolidate with MultiIndex on both axes does not blow up."""
    index = MultiIndex.from_tuples([('foo', 'one'), ('foo', 'two'),
                                    ('bar', 'one'), ('bar', 'two')])
    df = DataFrame(np.random.randn(4, 4), index=index, columns=index)
    df['Totals', ''] = df.sum(1)
    df = df.consolidate()

def test_ix_preserve_names(self):
    """Partial .ix indexing drops outer levels but keeps remaining names."""
    result = self.ymd.ix[2000]
    result2 = self.ymd['A'].ix[2000]
    self.assertEquals(result.index.names, self.ymd.index.names[1:])
    self.assertEquals(result2.index.names, self.ymd.index.names[1:])
    result = self.ymd.ix[2000, 2]
    result2 = self.ymd['A'].ix[2000, 2]
    self.assertEquals(result.index.name, self.ymd.index.names[2])
    self.assertEquals(result2.index.name, self.ymd.index.names[2])

def test_partial_set(self):
    """Partial-index assignment writes the full sub-block."""
    # GH #397
    df = self.ymd.copy()
    exp = self.ymd.copy()
    df.ix[2000, 4] = 0
    exp.ix[2000, 4].values[:] = 0
    assert_frame_equal(df, exp)
    df['A'].ix[2000, 4] = 1
    exp['A'].ix[2000, 4].values[:] = 1
    assert_frame_equal(df, exp)
    df.ix[2000] = 5
    exp.ix[2000].values[:] = 5
    assert_frame_equal(df, exp)
    # this works...for now
    df['A'].ix[14] = 5
    self.assertEquals(df['A'][14], 5)

def test_unstack_preserve_types(self):
    """unstack keeps float/object column dtypes (ints upcast to float)."""
    # GH #403
    self.ymd['E'] = 'foo'
    self.ymd['F'] = 2
    unstacked = self.ymd.unstack('month')
    self.assert_(unstacked['A', 1].dtype == np.float64)
    self.assert_(unstacked['E', 1].dtype == np.object_)
    self.assert_(unstacked['F', 1].dtype == np.float64)

def test_partial_ix_missing(self):
    """Partial .ix lookup; missing keys raise rather than returning junk."""
    result = self.ymd.ix[2000, 0]
    expected = self.ymd.ix[2000]['A']
    assert_series_equal(result, expected)
    # need to put in some work here
    # self.ymd.ix[2000, 0] = 0
    # self.assert_((self.ymd.ix[2000]['A'] == 0).all())
    self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6))
    self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6), 0)

def test_to_html(self):
    """Smoke test: to_html renders MultiIndex frames without error."""
    self.ymd.columns.name = 'foo'
    self.ymd.to_html()
    self.ymd.T.to_html()
mergeL = pd.merge(employee, sales, on = 'ID', how = 'left') # A Left inner Join mergeR = pd.merge(employee, sales, on = 'ID', how = 'right') # A Right inner Join mergeO = pd.merge(employee, sales, on = 'ID', how = 'outer') # An Outer Join mergeM = pd.merge(sales, bonus, on = 'ID') # A many-to-many Join stack = pd.concat([employee, sales], ignore_index = True) # Vertical Stacking ############################################################################################################# # 8. Reshaping & Pivoting ############################################################################################################# df1 = DataFrame([['Big','LAX',3,np.nan],['Big','SFO',6,7],['Med','SEA-TAC',9,np.nan],['Small','POR',np.nan,np.nan]], index=pd.Index(['LA', 'SF', 'SEA', 'POR']), columns=pd.Index(['Type', 'Airport', 'Cool Factor','D'])) # .unstack(): used to convert columns into rows and into a hierarchical index df2 = df1.stack(dropna = False) # converts columns into the child index df3 = df1.unstack() # converts columns into the parent index # .pivot(index, columns, values) is used to reshape data like dplyr in R df4 = df1.pivot('Airport','Type','Cool Factor') # yes! its that easy to reshape! ############################################################################################################# # 9. Outlier Analysis ############################################################################################################# np.random.seed(12345) df = DataFrame(np.random.randn(1000,4)) df.describe() # assume outliers are in the -+3 region df[0][np.abs(df[0])>3] # show all rows in column 0 that are > abs(3) df[(np.abs(df)>3).any(1)] # show all values in the dataframe that are > abs(3) df[np.abs(df)>3] = np.sign(df) * 3 # caps all values > abs(3) to 3; .sign()
"""Walkthrough of stack/unstack on hierarchically indexed pandas objects."""
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
import datetime

# A 2x3 frame whose row and column indexes both carry names.
state_index = pd.Index(['Ohio', 'Colorado'], name='state')
number_index = pd.Index(['one', 'two', 'three'], name='number')
data = DataFrame(np.arange(6).reshape((2, 3)),
                 index=state_index, columns=number_index)
print(data)

# stack() pivots the columns into the innermost row-index level.
result = data.stack()
print(result)
print(result.unstack())          # inverse: back to the original layout
print(result.unstack(0))         # pivot the outermost level instead
print(result.unstack('state'))   # same thing, selecting the level by name

# Concatenating with keys= builds a two-level index from scratch.
s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
print(data2)
print(data2.unstack())
print(data2.unstack().stack())               # drops the NaNs introduced by unstack
print(data2.unstack().stack(dropna=False))   # keeps them

# A frame whose columns carry a name ('side') stacks by that name too.
df = DataFrame({'left': result, 'right': result + 5},
               columns=pd.Index(['left', 'right'], name='side'))
print(df)
print(df.unstack('state'))
print(df.unstack('state').stack('side'))
class TestMultiLevel(unittest.TestCase):
    # Legacy pandas MultiIndex test suite (Python 2 era: cPickle, StringIO,
    # .ix, sortlevel, self.assert_/assertEquals). The class continues past
    # the end of this chunk.

    def setUp(self):
        """Build the shared fixtures: frame, single_level, series, tdf, ymd."""
        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        self.frame = DataFrame(np.random.randn(10, 3), index=index,
                               columns=Index(['A', 'B', 'C'], name='exp'))
        self.single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']],
                                       labels=[[0, 1, 2, 3]],
                                       names=['first'])
        # create test series object
        arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'],
                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
        tuples = zip(*arrays)
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)
        s[3] = np.NaN
        self.series = s
        tm.N = 100
        self.tdf = tm.makeTimeDataFrame()
        # ymd: time frame summed down to a (year, month, day) MultiIndex
        self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month,
                                     lambda x: x.day]).sum()
        # use Int64Index, to make sure things work
        self.ymd.index.levels = [lev.astype('i8')
                                 for lev in self.ymd.index.levels]
        self.ymd.index.names = ['year', 'month', 'day']

    def test_append(self):
        """Appending the two halves of the frame reconstructs it."""
        a, b = self.frame[:5], self.frame[5:]
        result = a.append(b)
        tm.assert_frame_equal(result, self.frame)
        result = a['A'].append(b['A'])
        tm.assert_series_equal(result, self.frame['A'])

    def test_dataframe_constructor(self):
        """Lists/arrays of arrays as index or columns become a MultiIndex."""
        multi = DataFrame(np.random.randn(4, 4),
                          index=[np.array(['a', 'a', 'b', 'b']),
                                 np.array(['x', 'y', 'x', 'y'])])
        self.assert_(isinstance(multi.index, MultiIndex))
        self.assert_(not isinstance(multi.columns, MultiIndex))
        multi = DataFrame(np.random.randn(4, 4),
                          columns=[['a', 'a', 'b', 'b'],
                                   ['x', 'y', 'x', 'y']])
        self.assert_(isinstance(multi.columns, MultiIndex))

    def test_series_constructor(self):
        """Same MultiIndex promotion for the Series constructor."""
        multi = Series(1., index=[np.array(['a', 'a', 'b', 'b']),
                                  np.array(['x', 'y', 'x', 'y'])])
        self.assert_(isinstance(multi.index, MultiIndex))
        multi = Series(1., index=[['a', 'a', 'b', 'b'],
                                  ['x', 'y', 'x', 'y']])
        self.assert_(isinstance(multi.index, MultiIndex))
        multi = Series(range(4), index=[['a', 'a', 'b', 'b'],
                                        ['x', 'y', 'x', 'y']])
        self.assert_(isinstance(multi.index, MultiIndex))

    def test_reindex_level(self):
        """reindex(..., level=...) broadcasts level aggregates like transform."""
        # axis=0
        month_sums = self.ymd.sum(level='month')
        result = month_sums.reindex(self.ymd.index, level=1)
        expected = self.ymd.groupby(level='month').transform(np.sum)
        assert_frame_equal(result, expected)
        # Series
        result = month_sums['A'].reindex(self.ymd.index, level=1)
        expected = self.ymd['A'].groupby(level='month').transform(np.sum)
        assert_series_equal(result, expected)
        # axis=1
        month_sums = self.ymd.T.sum(axis=1, level='month')
        result = month_sums.reindex(columns=self.ymd.index, level=1)
        expected = self.ymd.groupby(level='month').transform(np.sum).T
        assert_frame_equal(result, expected)

    def test_binops_level(self):
        """Binary ops with level= broadcast like a groupby-transform."""
        def _check_op(opname):
            # DataFrame flavor
            op = getattr(DataFrame, opname)
            month_sums = self.ymd.sum(level='month')
            result = op(self.ymd, month_sums, level='month')
            broadcasted = self.ymd.groupby(level='month').transform(np.sum)
            expected = op(self.ymd, broadcasted)
            assert_frame_equal(result, expected)
            # Series
            op = getattr(Series, opname)
            result = op(self.ymd['A'], month_sums['A'], level='month')
            broadcasted = self.ymd['A'].groupby(level='month').transform(
                np.sum)
            expected = op(self.ymd['A'], broadcasted)
            assert_series_equal(result, expected)
        _check_op('sub')
        _check_op('add')
        _check_op('mul')
        _check_op('div')

    def test_pickle(self):
        """Frames with MultiIndexes survive a pickle round-trip."""
        import cPickle  # Python 2 pickle module

        def _test_roundtrip(frame):
            pickled = cPickle.dumps(frame)
            unpickled = cPickle.loads(pickled)
            assert_frame_equal(frame, unpickled)
        _test_roundtrip(self.frame)
        _test_roundtrip(self.frame.T)
        _test_roundtrip(self.ymd)
        _test_roundtrip(self.ymd.T)

    def test_reindex(self):
        """.ix with a list of full-key tuples selects those rows."""
        reindexed = self.frame.ix[[('foo', 'one'), ('bar', 'one')]]
        expected = self.frame.ix[[0, 3]]
        assert_frame_equal(reindexed, expected)

    def test_reindex_preserve_levels(self):
        """Reindexing with an existing sub-index reuses that index object."""
        new_index = self.ymd.index[::10]
        chunk = self.ymd.reindex(new_index)
        self.assert_(chunk.index is new_index)
        chunk = self.ymd.ix[new_index]
        self.assert_(chunk.index is new_index)
        ymdT = self.ymd.T
        chunk = ymdT.reindex(columns=new_index)
        self.assert_(chunk.columns is new_index)
        chunk = ymdT.ix[:, new_index]
        self.assert_(chunk.columns is new_index)

    def test_sort_index_preserve_levels(self):
        """sort_index keeps the MultiIndex level names."""
        result = self.frame.sort_index()
        self.assertEquals(result.index.names, self.frame.index.names)

    def test_repr_to_string(self):
        """Smoke test: repr and to_string run on all fixture orientations."""
        repr(self.frame)
        repr(self.ymd)
        repr(self.frame.T)
        repr(self.ymd.T)
        buf = StringIO()
        self.frame.to_string(buf=buf)
        self.ymd.to_string(buf=buf)
        self.frame.T.to_string(buf=buf)
        self.ymd.T.to_string(buf=buf)

    def test_repr_name_coincide(self):
        """repr stays well-formed when a level name equals a level value."""
        index = MultiIndex.from_tuples([('a', 0, 'foo'), ('b', 1, 'bar')],
                                       names=['a', 'b', 'c'])
        df = DataFrame({'value': [0, 1]}, index=index)
        lines = repr(df).split('\n')
        self.assert_(lines[2].startswith('a 0 foo'))

    def test_getitem_simple(self):
        """Tuple column access; missing tuples/keys raise KeyError."""
        df = self.frame.T
        col = df['foo', 'one']
        assert_almost_equal(col.values, df.values[:, 0])
        self.assertRaises(KeyError, df.__getitem__, ('foo', 'four'))
        self.assertRaises(KeyError, df.__getitem__, 'foobar')

    def test_series_getitem(self):
        """Partial and full tuple indexing on a MultiIndexed Series."""
        s = self.ymd['A']
        result = s[2000, 3]
        result2 = s.ix[2000, 3]
        # positions 42:65 correspond to (2000, 3) in the ymd fixture
        expected = s.reindex(s.index[42:65])
        expected.index = expected.index.droplevel(0).droplevel(0)
        assert_series_equal(result, expected)
        result = s[2000, 3, 10]
        expected = s[49]
        self.assertEquals(result, expected)
        # fancy
        result = s.ix[[(2000, 3, 10), (2000, 3, 13)]]
        expected = s.reindex(s.index[49:51])
        assert_series_equal(result, expected)
        # key error
        self.assertRaises(KeyError, s.__getitem__, (2000, 3, 4))

    def test_series_getitem_corner(self):
        """Out-of-bounds ints raise; generator masks behave like boolean masks."""
        s = self.ymd['A']
        # don't segfault, GH #495
        # out of bounds access
        self.assertRaises(IndexError, s.__getitem__, len(self.ymd))
        # generator
        result = s[(x > 0 for x in s)]
        expected = s[s > 0]
        assert_series_equal(result, expected)

    def test_series_setitem(self):
        """Partial-key assignment writes NaN into exactly the matched block."""
        s = self.ymd['A']
        s[2000, 3] = np.nan
        self.assert_(isnull(s.values[42:65]).all())
        self.assert_(notnull(s.values[:42]).all())
        self.assert_(notnull(s.values[65:]).all())
        s[2000, 3, 10] = np.nan
        self.assert_(isnull(s[49]))

    def test_series_slice_partial(self):
        # placeholder — intentionally empty
        pass

    def test_frame_getitem_setitem_slice(self):
        """Positional .ix slices read and write like plain slices."""
        # getitem
        result = self.frame.ix[:4]
        expected = self.frame[:4]
        assert_frame_equal(result, expected)
        # setitem
        cp = self.frame.copy()
        cp.ix[:4] = 0
        self.assert_((cp.values[:4] == 0).all())
        self.assert_((cp.values[4:] != 0).all())

    def test_frame_getitem_setitem_multislice(self):
        """.ix with row slice + column label on a MultiIndexed frame."""
        levels = [['t1', 't2'], ['a','b','c']]
        labels = [[0,0,0,1,1], [0,1,2,0,1]]
        midx = MultiIndex(labels=labels, levels=levels, names=[None, 'id'])
        df = DataFrame({'value':[1,2,3,7,8]}, index=midx)
        result = df.ix[:,'value']
        assert_series_equal(df['value'], result)
        result = df.ix[1:3,'value']
        assert_series_equal(df['value'][1:3], result)
        result = df.ix[:,:]
        assert_frame_equal(df, result)
        result = df
        df.ix[:, 'value'] = 10
        result['value'] = 10
        assert_frame_equal(df, result)
        df.ix[:,:] = 10
        assert_frame_equal(df, result)

    def test_getitem_tuple_plus_slice(self):
        """.ix[(key tuple), :] equals .ix[key] and xs(key)."""
        # GH #671
        df = DataFrame({'a' : range(10),
                        'b' : range(10),
                        'c' : np.random.randn(10),
                        'd' : np.random.randn(10)})
        idf = df.set_index(['a', 'b'])
        result = idf.ix[(0, 0), :]
        expected = idf.ix[0, 0]
        expected2 = idf.xs((0, 0))
        assert_series_equal(result, expected)
        assert_series_equal(result, expected2)

    def test_getitem_setitem_tuple_plus_columns(self):
        """.ix[(full key), [cols]] equals indexing then column selection."""
        # GH #1013
        df = self.ymd[:5]
        result = df.ix[(2000, 1, 6), ['A', 'B', 'C']]
        expected = df.ix[2000, 1, 6][['A', 'B', 'C']]
        assert_series_equal(result, expected)

    def test_xs(self):
        """xs with a full key tuple equals .ix with the same tuple."""
        xs = self.frame.xs(('bar', 'two'))
        xs2 = self.frame.ix[('bar', 'two')]
        assert_series_equal(xs, xs2)
        assert_almost_equal(xs.values, self.frame.values[4])

    def test_xs_partial(self):
        """xs with only the outer key returns the cross-section frame."""
        result = self.frame.xs('foo')
        result2 = self.frame.ix['foo']
        expected = self.frame.T['foo'].T
        assert_frame_equal(result, expected)
        assert_frame_equal(result, result2)

    def test_xs_level(self):
        """xs(key, level=...) selects on an inner level and drops it."""
        result = self.frame.xs('two', level='second')
        expected = self.frame[self.frame.index.get_level_values(1) == 'two']
        expected.index = expected.index.droplevel(1)
        assert_frame_equal(result, expected)
        index = MultiIndex.from_tuples([('x', 'y', 'z'), ('a', 'b', 'c'),
                                        ('p', 'q', 'r')])
        df = DataFrame(np.random.randn(3, 5), index=index)
        result = df.xs('c', level=2)
        expected = df[1:2]
        expected.index = expected.index.droplevel(2)
        assert_frame_equal(result, expected)

    def test_xs_level_multiple(self):
        """xs with a tuple of keys across multiple named levels."""
        from pandas import read_table
        from StringIO import StringIO
        # NOTE(review): whitespace inside this literal was mangled by
        # formatting; read_table(sep='\s+') tolerates it.
        text = """ A B C D E one two three four a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
        df = read_table(StringIO(text), sep='\s+')
        result = df.xs(('a', 4), level=['one', 'four'])
        expected = df.xs('a').xs(4, level='four')
        assert_frame_equal(result, expected)

    def test_xs_level0(self):
        """xs(key, level=0) equals plain xs(key)."""
        from pandas import read_table
        from StringIO import StringIO
        # NOTE(review): same mangled-whitespace literal as above.
        text = """ A B C D E one two three four a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
        df = read_table(StringIO(text), sep='\s+')
        result = df.xs('a', level=0)
        expected = df.xs('a')
        self.assertEqual(len(result), 2)
        assert_frame_equal(result, expected)

    def test_xs_level_series(self):
        """Series s[:, key] cross-sections like frame.xs(key, level=1)."""
        s = self.frame['A']
        result = s[:, 'two']
        expected = self.frame.xs('two', level=1)['A']
        assert_series_equal(result, expected)
        s = self.ymd['A']
        result = s[2000, 5]
        expected = self.ymd.ix[2000, 5]['A']
        assert_series_equal(result, expected)
        # not implementing this for now
        self.assertRaises(TypeError, s.__getitem__, (2000, slice(3, 4)))
        # result = s[2000, 3:4]
        # lv =s.index.get_level_values(1)
        # expected = s[(lv == 3) | (lv == 4)]
        # expected.index = expected.index.droplevel(0)
        # assert_series_equal(result, expected)
        # can do this though

    def test_get_loc_single_level(self):
        """Every key of a single-level MultiIndex is retrievable."""
        s = Series(np.random.randn(len(self.single_level)),
                   index=self.single_level)
        for k in self.single_level.values:
            s[k]

    def test_getitem_toplevel(self):
        """Selecting a top-level column key drops that level from columns."""
        df = self.frame.T
        result = df['foo']
        expected = df.reindex(columns=df.columns[:3])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)
        result = df['bar']
        result2 = df.ix[:, 'bar']
        expected = df.reindex(columns=df.columns[3:5])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result, result2)

    def test_getitem_setitem_slice_integers(self):
        """Integer .ix slices on an integer-leveled MultiIndex are positional."""
        index = MultiIndex(levels=[[0, 1, 2], [0, 2]],
                           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])
        frame = DataFrame(np.random.randn(len(index), 4), index=index,
                          columns=['a', 'b', 'c', 'd'])
        res = frame.ix[1:2]
        exp = frame.reindex(frame.index[2:])
        assert_frame_equal(res, exp)
        frame.ix[1:2] = 7
        self.assert_((frame.ix[1:2] == 7).values.all())
        series = Series(np.random.randn(len(index)), index=index)
        res = series.ix[1:2]
        exp = series.reindex(series.index[2:])
        assert_series_equal(res, exp)
        series.ix[1:2] = 7
        self.assert_((series.ix[1:2] == 7).values.all())

    def test_getitem_int(self):
        """Integer .ix selects by outer label on an int-leveled MultiIndex."""
        levels = [[0, 1], [0, 1, 2]]
        labels = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]
        index = MultiIndex(levels=levels, labels=labels)
        frame = DataFrame(np.random.randn(6, 2), index=index)
        result = frame.ix[1]
        expected = frame[-3:]
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)
        # raises exception
        self.assertRaises(KeyError, frame.ix.__getitem__, 3)
        # however this will work
        result = self.frame.ix[2]
        expected = self.frame.xs(self.frame.index[2])
        assert_series_equal(result, expected)

    def test_getitem_partial(self):
        """Partial tuple column selection drops the matched outer levels."""
        ymd = self.ymd.T
        result = ymd[2000, 2]
        expected = ymd.reindex(
            columns=ymd.columns[ymd.columns.labels[1] == 1])
        expected.columns = expected.columns.droplevel(0).droplevel(0)
        assert_frame_equal(result, expected)

    def test_getitem_slice_not_sorted(self):
        """Column slicing with an np.int32 bound on unsorted columns."""
        df = self.frame.sortlevel(1).T
        # buglet with int typechecking
        result = df.ix[:, :np.int32(3)]
        expected = df.reindex(columns=df.columns[:3])
        assert_frame_equal(result, expected)

    def test_setitem_change_dtype(self):
        """Assigning a bool Series into a float column changes its dtype."""
        dft = self.frame.T
        s = dft['foo', 'two']
        dft['foo', 'two'] = s > s.median()
        assert_series_equal(dft['foo', 'two'], s > s.median())
        # internal block items should still carry the MultiIndex
        self.assert_(isinstance(dft._data.blocks[1].items, MultiIndex))
        reindexed = dft.reindex(columns=[('foo', 'two')])
        assert_series_equal(reindexed['foo', 'two'], s > s.median())

    def test_frame_setitem_ix(self):
        """Scalar assignment via .ix with a (row tuple, column) key."""
        self.frame.ix[('bar', 'two'), 'B'] = 5
        self.assertEquals(self.frame.ix[('bar', 'two'), 'B'], 5)
        # with integer labels
        df = self.frame.copy()
        df.columns = range(3)
        df.ix[('bar', 'two'), 1] = 7
        self.assertEquals(df.ix[('bar', 'two'), 1], 7)

    def test_fancy_slice_partial(self):
        """Label slices on partial keys select the enclosed row blocks."""
        result = self.frame.ix['bar':'baz']
        expected = self.frame[3:7]
        assert_frame_equal(result, expected)
        result = self.ymd.ix[(2000,2):(2000,4)]
        lev = self.ymd.index.labels[1]
        expected = self.ymd[(lev >= 1) & (lev <= 3)]
        assert_frame_equal(result, expected)

    def test_sortlevel(self):
        """sortlevel requires a MultiIndex and preserves level names."""
        df = self.frame.copy()
        df.index = np.arange(len(df))
        self.assertRaises(Exception, df.sortlevel, 0)
        # axis=1
        # series
        a_sorted = self.frame['A'].sortlevel(0)
        self.assertRaises(Exception,
                          self.frame.reset_index()['A'].sortlevel)
        # preserve names
        self.assertEquals(a_sorted.index.names, self.frame.index.names)

    def test_delevel_infer_dtype(self):
        """reset_index infers int/float dtypes for the materialized levels."""
        tuples = [tuple for tuple in cart_product(['foo', 'bar'],
                                                  [10, 20],
                                                  [1.0, 1.1])]
        index = MultiIndex.from_tuples(tuples,
                                       names=['prm0', 'prm1', 'prm2'])
        df = DataFrame(np.random.randn(8,3), columns=['A', 'B', 'C'],
                       index=index)
        deleveled = df.reset_index()
        self.assert_(com.is_integer_dtype(deleveled['prm1']))
        self.assert_(com.is_float_dtype(deleveled['prm2']))

    def test_reset_index_with_drop(self):
        """reset_index(drop=True) discards levels instead of adding columns."""
        deleveled = self.ymd.reset_index(drop = True)
        self.assertEquals(len(deleveled.columns), len(self.ymd.columns))
        deleveled = self.series.reset_index()
        self.assert_(isinstance(deleveled, DataFrame))
        self.assert_(len(deleveled.columns) ==
                     len(self.series.index.levels)+1)
        deleveled = self.series.reset_index(drop = True)
        self.assert_(isinstance(deleveled, Series))

    def test_sortlevel_by_name(self):
        """sortlevel accepts a level name in place of a level number."""
        self.frame.index.names = ['first', 'second']
        result = self.frame.sortlevel(level='second')
        expected = self.frame.sortlevel(level=1)
        assert_frame_equal(result, expected)

    def test_sortlevel_mixed(self):
        """sortlevel is stable in the presence of a mixed-dtype column."""
        sorted_before = self.frame.sortlevel(1)
        df = self.frame.copy()
        df['foo'] = 'bar'
        sorted_after = df.sortlevel(1)
        assert_frame_equal(sorted_before,
                           sorted_after.drop(['foo'], axis=1))
        dft = self.frame.T
        sorted_before = dft.sortlevel(1, axis=1)
        dft['foo', 'three'] = 'bar'
        sorted_after = dft.sortlevel(1, axis=1)
        assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1),
                           sorted_after.drop([('foo', 'three')], axis=1))

    def test_count_level(self):
        """count(level=i) matches groupby(level=i).count() on every level/axis."""
        def _check_counts(frame, axis=0):
            index = frame._get_axis(axis)
            for i in range(index.nlevels):
                result = frame.count(axis=axis, level=i)
                expected = frame.groupby(axis=axis,
                                         level=i).count(axis=axis)
                expected = expected.reindex_like(result).astype('i8')
                assert_frame_equal(result, expected)
        # poke some NaNs in so counts actually differ per group
        self.frame.ix[1, [1, 2]] = np.nan
        self.frame.ix[7, [0, 1]] = np.nan
        self.ymd.ix[1, [1, 2]] = np.nan
        self.ymd.ix[7, [0, 1]] = np.nan
        _check_counts(self.frame)
        _check_counts(self.ymd)
        _check_counts(self.frame.T, axis=1)
        _check_counts(self.ymd.T, axis=1)
        # can't call with level on regular DataFrame
        df = tm.makeTimeDataFrame()
        self.assertRaises(Exception, df.count, level=0)
        self.frame['D'] = 'foo'
        result = self.frame.count(level=0, numeric_only=True)
        assert_almost_equal(result.columns, ['A', 'B', 'C'])

    def test_count_level_series(self):
        """Series.count(level=...) matches an explicit groupby(level).count()."""
        index = MultiIndex(levels=[['foo', 'bar', 'baz'],
                                   ['one', 'two', 'three', 'four']],
                           labels=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]])
        s = Series(np.random.randn(len(index)), index=index)
        result = s.count(level=0)
        expected = s.groupby(level=0).count()
        assert_series_equal(result.astype('f8'),
                            expected.reindex(result.index).fillna(0))
        result = s.count(level=1)
        expected = s.groupby(level=1).count()
        assert_series_equal(result.astype('f8'),
                            expected.reindex(result.index).fillna(0))

    def test_count_level_corner(self):
        """count(level=...) on empty objects yields all-zero results."""
        s = self.frame['A'][:0]
        result = s.count(level=0)
        expected = Series(0, index=s.index.levels[0])
        assert_series_equal(result, expected)
        df = self.frame[:0]
        result = df.count(level=0)
        expected = DataFrame({}, index=s.index.levels[0],
                             columns=df.columns).fillna(0).astype(int)
        assert_frame_equal(result, expected)

    def test_unstack(self):
        """Smoke test: unstack runs on float and int frames."""
        # just check that it works for now
        unstacked = self.ymd.unstack()
        unstacked2 = unstacked.unstack()
        # test that ints work
        unstacked = self.ymd.astype(int).unstack()

    def test_unstack_multiple_no_empty_columns(self):
        """Multi-level unstack drops columns that would be entirely NaN."""
        index = MultiIndex.from_tuples([(0, 'foo', 0), (0, 'bar', 0),
                                        (1, 'baz', 1), (1, 'qux', 1)])
        s = Series(np.random.randn(4), index=index)
        unstacked = s.unstack([1, 2])
        expected = unstacked.dropna(axis=1, how='all')
        assert_frame_equal(unstacked, expected)

    def test_stack(self):
        """stack/unstack round-trips the (year, month, day) frame in many layouts."""
        # regular roundtrip
        unstacked = self.ymd.unstack()
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)
        unlexsorted = self.ymd.sortlevel(2)
        unstacked = unlexsorted.unstack(2)
        restacked = unstacked.stack()
        assert_frame_equal(restacked.sortlevel(0), self.ymd)
        unlexsorted = unlexsorted[::-1]
        unstacked = unlexsorted.unstack(1)
        restacked = unstacked.stack().swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)
        unlexsorted = unlexsorted.swaplevel(0, 1)
        unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1)
        restacked = unstacked.stack(0).swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)
        # columns unsorted
        unstacked = self.ymd.unstack()
        unstacked = unstacked.sort(axis=1, ascending=False)
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)
        # more than 2 levels in the columns
        unstacked = self.ymd.unstack(1).unstack(1)
        result = unstacked.stack(1)
        expected = self.ymd.unstack()
        assert_frame_equal(result, expected)
        result = unstacked.stack(2)
        expected = self.ymd.unstack(1)
        assert_frame_equal(result, expected)
        result = unstacked.stack(0)
        expected = self.ymd.stack().unstack(1).unstack(1)
        assert_frame_equal(result, expected)
        # not all levels present in each echelon
        unstacked = self.ymd.unstack(2).ix[:, ::3]
        stacked = unstacked.stack().stack()
        ymd_stacked = self.ymd.stack()
        assert_series_equal(stacked, ymd_stacked.reindex(stacked.index))
        # stack with negative number
        result = self.ymd.unstack(0).stack(-2)
        expected = self.ymd.unstack(0).stack(0)
        # NOTE(review): no assertion comparing result/expected here —
        # presumably only exercising that stack(-2) runs; confirm upstream.

    def test_stack_mixed_dtype(self):
        """Stacking a mixed-dtype frame keeps per-column dtypes intact."""
        df = self.frame.T
        df['foo', 'four'] = 'foo'
        df = df.sortlevel(1, axis=1)
        stacked = df.stack()
        assert_series_equal(stacked['foo'], df['foo'].stack())
        self.assert_(stacked['bar'].dtype == np.float_)

    def test_unstack_bug(self):
        """Round-trip a groupby-apply result through unstack/stack."""
        df = DataFrame({'state': ['naive','naive','naive',
                                  'activ','activ','activ'],
                        'exp': ['a','b','b','b','a','a'],
                        'barcode': [1,2,3,4,1,3],
                        'v': ['hi','hi','bye','bye','bye','peace'],
                        'extra': np.arange(6.)})
        result = df.groupby(['state','exp','barcode','v']).apply(len)
        unstacked = result.unstack()
        restacked = unstacked.stack()
        assert_series_equal(restacked,
                            result.reindex(restacked.index).astype(float))

    def test_stack_unstack_preserve_names(self):
        """Index/column level names survive stack and unstack."""
        unstacked = self.frame.unstack()
        self.assertEquals(unstacked.index.name, 'first')
        self.assertEquals(unstacked.columns.names, ['exp', 'second'])
        restacked = unstacked.stack()
        self.assertEquals(restacked.index.names, self.frame.index.names)

    def test_unstack_level_name(self):
        """unstack accepts a level name as well as a level number."""
        result = self.frame.unstack('second')
        expected = self.frame.unstack(level=1)
        assert_frame_equal(result, expected)

    def test_stack_level_name(self):
        """stack accepts a level name as well as a level number."""
        unstacked = self.frame.unstack('second')
        result = unstacked.stack('exp')
        expected = self.frame.unstack().stack(0)
        assert_frame_equal(result, expected)
        result = self.frame.stack('exp')
        expected = self.frame.stack()
        assert_series_equal(result, expected)

    def test_stack_unstack_multiple(self):
        """unstack/stack with a list of levels behaves like chained calls."""
        unstacked = self.ymd.unstack(['year', 'month'])
        expected = self.ymd.unstack('year').unstack('month')
        assert_frame_equal(unstacked, expected)
        self.assertEquals(unstacked.columns.names, expected.columns.names)
        # series
        s = self.ymd['A']
        s_unstacked = s.unstack(['year', 'month'])
        assert_frame_equal(s_unstacked, expected['A'])
        restacked = unstacked.stack(['year', 'month'])
        restacked = restacked.swaplevel(0, 1).swaplevel(1, 2)
        restacked = restacked.sortlevel(0)
        assert_frame_equal(restacked, self.ymd)
        self.assertEquals(restacked.index.names, self.ymd.index.names)
        # GH #451
        unstacked = self.ymd.unstack([1, 2])
        expected = self.ymd.unstack(1).unstack(1).dropna(axis=1,
                                                         how='all')
        assert_frame_equal(unstacked, expected)
        unstacked = self.ymd.unstack([2, 1])
        expected = self.ymd.unstack(2).unstack(1).dropna(axis=1,
                                                         how='all')
        assert_frame_equal(unstacked, expected.ix[:, unstacked.columns])

    def test_groupby_transform(self):
        """groupby.apply and groupby.transform agree for an elementwise func."""
        s = self.frame['A']
        grouper = s.index.get_level_values(0)
        grouped = s.groupby(grouper)
        applied = grouped.apply(lambda x: x * 2)
        expected = grouped.transform(lambda x: x * 2)
        assert_series_equal(applied.reindex(expected.index), expected)

    def test_groupby_corner(self):
        """groupby(level=name) works when every level has a single value."""
        midx = MultiIndex(levels=[['foo'],['bar'],['baz']],
                          labels=[[0],[0],[0]],
                          names=['one','two','three'])
        df = DataFrame([np.random.rand(4)], columns=['a','b','c','d'],
                       index=midx)
        # should work
        df.groupby(level='three')

    def test_join(self):
        """Outer join of two column slices reconstructs the frame (modulo NaNs)."""
        a = self.frame.ix[:5, ['A']]
        b = self.frame.ix[2:, ['B', 'C']]
        joined = a.join(b, how='outer').reindex(self.frame.index)
        expected = self.frame.copy()
        expected.values[np.isnan(joined.values)] = np.nan
        self.assert_(not np.isnan(joined.values).all())
        assert_frame_equal(joined, expected)

    def test_swaplevel(self):
        """swaplevel by number and by name are equivalent and invertible."""
        swapped = self.frame['A'].swaplevel(0, 1)
        swapped2 = self.frame['A'].swaplevel('first', 'second')
        self.assert_(not swapped.index.equals(self.frame.index))
        assert_series_equal(swapped, swapped2)
        back = swapped.swaplevel(0, 1)
        back2 = swapped.swaplevel('second', 'first')
        self.assert_(back.index.equals(self.frame.index))
        assert_series_equal(back, back2)
        ft = self.frame.T
        swapped = ft.swaplevel('first', 'second', axis=1)
        exp = self.frame.swaplevel('first', 'second').T
        assert_frame_equal(swapped, exp)

    def test_swaplevel_panel(self):
        """swaplevel on a Panel's major axis (legacy 3-D structure)."""
        panel = Panel({'ItemA': self.frame, 'ItemB': self.frame * 2})
        result = panel.swaplevel(0, 1, axis='major')
        expected = panel.copy()
        expected.major_axis = expected.major_axis.swaplevel(0, 1)
        tm.assert_panel_equal(result, expected)

    def test_reorder_levels(self):
        """reorder_levels is equivalent to the corresponding swaplevel chain."""
        result = self.ymd.reorder_levels(['month', 'day', 'year'])
        expected = self.ymd.swaplevel(0, 1).swaplevel(1, 2)
        assert_frame_equal(result, expected)
        result = self.ymd['A'].reorder_levels(['month', 'day', 'year'])
        expected = self.ymd['A'].swaplevel(0, 1).swaplevel(1, 2)
        assert_series_equal(result, expected)
        result = self.ymd.T.reorder_levels(['month', 'day', 'year'],
                                           axis=1)
        expected = self.ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2,
                                                                axis=1)
        assert_frame_equal(result, expected)
        self.assertRaises(Exception, self.ymd.index.reorder_levels,
                          [1, 2, 3])

    def test_insert_index(self):
        """Assigning a new tuple column keeps a MultiIndex on the columns."""
        df = self.ymd[:5].T
        df[2000, 1, 10] = df[2000, 1, 7]
        self.assert_(isinstance(df.columns, MultiIndex))
        self.assert_((df[2000, 1, 10] == df[2000, 1, 7]).all())

    def test_alignment(self):
        """Arithmetic aligns on the union of two MultiIndexes."""
        x = Series(data=[1,2,3],
                   index=MultiIndex.from_tuples([("A", 1), ("A", 2),
                                                 ("B",3)]))
        y = Series(data=[4,5,6],
                   index=MultiIndex.from_tuples([("Z", 1), ("Z", 2),
                                                 ("B",3)]))
        res = x - y
        exp_index = x.index.union(y.index)
        exp = x.reindex(exp_index) - y.reindex(exp_index)
        assert_series_equal(res, exp)
        # hit non-monotonic code path
        res = x[::-1] - y[::-1]
        exp_index = x.index.union(y.index)
        exp = x.reindex(exp_index) - y.reindex(exp_index)
        assert_series_equal(res, exp)

    def test_is_lexsorted(self):
        """is_lexsorted detects whether the labels are lexicographically sorted."""
        levels = [[0, 1], [0, 1, 2]]
        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 0, 1, 1, 1],
                                   [0, 1, 2, 0, 1, 2]])
        self.assert_(index.is_lexsorted())
        index = MultiIndex(levels=levels,
                           labels=[[0, 0, 0, 1, 1, 1],
                                   [0, 1, 2, 0, 2, 1]])
        self.assert_(not index.is_lexsorted())
        # NOTE: chunk ends mid-statement here; continued in the next chunk.
        index = MultiIndex(levels=levels, labels=[[0, 0, 1,
0, 1, 1], [0, 1, 0, 2, 2, 1]]) self.assert_(not index.is_lexsorted()) self.assert_(index.lexsort_depth == 0) def test_frame_getitem_view(self): df = self.frame.T df['foo'].values[:] = 0 self.assert_((df['foo'].values == 0).all()) # but not if it's mixed-type df['foo', 'four'] = 'foo' df = df.sortlevel(0, axis=1) df['foo']['one'] = 2 self.assert_((df['foo', 'one'] == 0).all()) def test_frame_getitem_not_sorted(self): df = self.frame.T df['foo', 'four'] = 'foo' arrays = [np.array(x) for x in zip(*df.columns.get_tuple_index())] result = df['foo'] result2 = df.ix[:, 'foo'] expected = df.reindex(columns=df.columns[arrays[0] == 'foo']) expected.columns = expected.columns.droplevel(0) assert_frame_equal(result, expected) assert_frame_equal(result2, expected) df = df.T result = df.xs('foo') result2 = df.ix['foo'] expected = df.reindex(df.index[arrays[0] == 'foo']) expected.index = expected.index.droplevel(0) assert_frame_equal(result, expected) assert_frame_equal(result2, expected) def test_series_getitem_not_sorted(self): arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] tuples = zip(*arrays) index = MultiIndex.from_tuples(tuples) s = Series(randn(8), index=index) arrays = [np.array(x) for x in zip(*index.get_tuple_index())] result = s['qux'] result2 = s.ix['qux'] expected = s[arrays[0] == 'qux'] expected.index = expected.index.droplevel(0) assert_series_equal(result, expected) assert_series_equal(result2, expected) def test_count(self): frame = self.frame.copy() frame.index.names = ['a', 'b'] result = frame.count(level='b') expect = self.frame.count(level=1) assert_frame_equal(result, expect) result = frame.count(level='a') expect = self.frame.count(level=0) assert_frame_equal(result, expect) series = self.series.copy() series.index.names = ['a', 'b'] result = series.count(level='b') expect = self.series.count(level=1) assert_series_equal(result, expect) result = series.count(level='a') expect = 
self.series.count(level=0) assert_series_equal(result, expect) self.assertRaises(Exception, series.count, 'x') self.assertRaises(Exception, frame.count, level='x') AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', 'mad', 'std', 'var'] def test_series_group_min_max(self): for op, level, skipna in cart_product(self.AGG_FUNCTIONS, range(2), [False, True]): grouped = self.series.groupby(level=level) aggf = lambda x: getattr(x, op)(skipna=skipna) # skipna=True leftside = grouped.agg(aggf) rightside = getattr(self.series, op)(level=level, skipna=skipna) assert_series_equal(leftside, rightside) def test_frame_group_ops(self): self.frame.ix[1, [1, 2]] = np.nan self.frame.ix[7, [0, 1]] = np.nan for op, level, axis, skipna in cart_product(self.AGG_FUNCTIONS, range(2), range(2), [False, True]): if axis == 0: frame = self.frame else: frame = self.frame.T grouped = frame.groupby(level=level, axis=axis) pieces = [] def aggf(x): pieces.append(x) return getattr(x, op)(skipna=skipna, axis=axis) leftside = grouped.agg(aggf) rightside = getattr(frame, op)(level=level, axis=axis, skipna=skipna) # for good measure, groupby detail level_index = frame._get_axis(axis).levels[level] self.assert_(leftside._get_axis(axis).equals(level_index)) self.assert_(rightside._get_axis(axis).equals(level_index)) assert_frame_equal(leftside, rightside) def test_std_var_pass_ddof(self): index = MultiIndex.from_arrays([np.arange(5).repeat(10), np.tile(np.arange(10), 5)]) df = DataFrame(np.random.randn(len(index), 5), index=index) for meth in ['var', 'std']: ddof = 4 alt = lambda x: getattr(x, meth)(ddof=ddof) result = getattr(df[0], meth)(level=0, ddof=ddof) expected = df[0].groupby(level=0).agg(alt) assert_series_equal(result, expected) result = getattr(df, meth)(level=0, ddof=ddof) expected = df.groupby(level=0).agg(alt) assert_frame_equal(result, expected) def test_frame_series_agg_multiple_levels(self): result = self.ymd.sum(level=['year', 'month']) expected = 
self.ymd.groupby(level=['year', 'month']).sum() assert_frame_equal(result, expected) result = self.ymd['A'].sum(level=['year', 'month']) expected = self.ymd['A'].groupby(level=['year', 'month']).sum() assert_series_equal(result, expected) def test_groupby_multilevel(self): result = self.ymd.groupby(level=[0, 1]).mean() k1 = self.ymd.index.get_level_values(0) k2 = self.ymd.index.get_level_values(1) expected = self.ymd.groupby([k1, k2]).mean() assert_frame_equal(result, expected) self.assertEquals(result.index.names, self.ymd.index.names[:2]) result2 = self.ymd.groupby(level=self.ymd.index.names[:2]).mean() assert_frame_equal(result, result2) def test_groupby_multilevel_with_transform(self): pass def test_multilevel_consolidate(self): index = MultiIndex.from_tuples([('foo', 'one'), ('foo', 'two'), ('bar', 'one'), ('bar', 'two')]) df = DataFrame(np.random.randn(4, 4), index=index, columns=index) df['Totals', ''] = df.sum(1) df = df.consolidate() def test_ix_preserve_names(self): result = self.ymd.ix[2000] result2 = self.ymd['A'].ix[2000] self.assertEquals(result.index.names, self.ymd.index.names[1:]) self.assertEquals(result2.index.names, self.ymd.index.names[1:]) result = self.ymd.ix[2000, 2] result2 = self.ymd['A'].ix[2000, 2] self.assertEquals(result.index.name, self.ymd.index.names[2]) self.assertEquals(result2.index.name, self.ymd.index.names[2]) def test_partial_set(self): # GH #397 df = self.ymd.copy() exp = self.ymd.copy() df.ix[2000, 4] = 0 exp.ix[2000, 4].values[:] = 0 assert_frame_equal(df, exp) df['A'].ix[2000, 4] = 1 exp['A'].ix[2000, 4].values[:] = 1 assert_frame_equal(df, exp) df.ix[2000] = 5 exp.ix[2000].values[:] = 5 assert_frame_equal(df, exp) # this works...for now df['A'].ix[14] = 5 self.assertEquals(df['A'][14], 5) def test_unstack_preserve_types(self): # GH #403 self.ymd['E'] = 'foo' self.ymd['F'] = 2 unstacked = self.ymd.unstack('month') self.assert_(unstacked['A', 1].dtype == np.float64) self.assert_(unstacked['E', 1].dtype == np.object_) 
self.assert_(unstacked['F', 1].dtype == np.float64) def test_getitem_lowerdim_corner(self): self.assertRaises(KeyError, self.frame.ix.__getitem__, (('bar', 'three'), 'B')) self.assertRaises(KeyError, self.frame.ix.__setitem__, (('bar', 'three'), 'B'), 0) #---------------------------------------------------------------------- # AMBIGUOUS CASES! def test_partial_ix_missing(self): raise nose.SkipTest result = self.ymd.ix[2000, 0] expected = self.ymd.ix[2000]['A'] assert_series_equal(result, expected) # need to put in some work here # self.ymd.ix[2000, 0] = 0 # self.assert_((self.ymd.ix[2000]['A'] == 0).all()) self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6)) self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6), 0) #---------------------------------------------------------------------- def test_to_html(self): self.ymd.columns.name = 'foo' self.ymd.to_html() self.ymd.T.to_html() def test_level_with_tuples(self): index = MultiIndex(levels=[[('foo', 'bar', 0), ('foo', 'baz', 0), ('foo', 'qux', 0)], [0, 1]], labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) series = Series(np.random.randn(6), index=index) frame = DataFrame(np.random.randn(6, 4), index=index) result = series[('foo', 'bar', 0)] result2 = series.ix[('foo', 'bar', 0)] expected = series[:2] expected.index = expected.index.droplevel(0) assert_series_equal(result, expected) assert_series_equal(result2, expected) self.assertRaises(KeyError, series.__getitem__, (('foo', 'bar', 0), 2)) result = frame.ix[('foo', 'bar', 0)] result2 = frame.xs(('foo', 'bar', 0)) expected = frame[:2] expected.index = expected.index.droplevel(0) assert_frame_equal(result, expected) assert_frame_equal(result2, expected) index = MultiIndex(levels=[[('foo', 'bar'), ('foo', 'baz'), ('foo', 'qux')], [0, 1]], labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) series = Series(np.random.randn(6), index=index) frame = DataFrame(np.random.randn(6, 4), index=index) result = series[('foo', 'bar')] result2 = series.ix[('foo', 
'bar')] expected = series[:2] expected.index = expected.index.droplevel(0) assert_series_equal(result, expected) assert_series_equal(result2, expected) result = frame.ix[('foo', 'bar')] result2 = frame.xs(('foo', 'bar')) expected = frame[:2] expected.index = expected.index.droplevel(0) assert_frame_equal(result, expected) assert_frame_equal(result2, expected) def test_int_series_slicing(self): s = self.ymd['A'] result = s[5:] expected = s.reindex(s.index[5:]) assert_series_equal(result, expected) exp = self.ymd['A'].copy() s[5:] = 0 exp.values[5:] = 0 self.assert_(np.array_equal(s.values, exp.values)) result = self.ymd[5:] expected = self.ymd.reindex(s.index[5:]) assert_frame_equal(result, expected) def test_mixed_depth_get(self): arrays = [[ 'a', 'top', 'top', 'routine1', 'routine1', 'routine2'], [ '', 'OD', 'OD', 'result1', 'result2', 'result1'], [ '', 'wx', 'wy', '', '', '']] tuples = zip(*arrays) tuples.sort() index = MultiIndex.from_tuples(tuples) df = DataFrame(randn(4,6),columns = index) result = df['a'] expected = df['a','',''] assert_series_equal(result, expected) self.assertEquals(result.name, 'a') result = df['routine1','result1'] expected = df['routine1','result1',''] assert_series_equal(result, expected) self.assertEquals(result.name, ('routine1', 'result1')) def test_mixed_depth_insert(self): arrays = [[ 'a', 'top', 'top', 'routine1', 'routine1', 'routine2'], [ '', 'OD', 'OD', 'result1', 'result2', 'result1'], [ '', 'wx', 'wy', '', '', '']] tuples = zip(*arrays) tuples.sort() index = MultiIndex.from_tuples(tuples) df = DataFrame(randn(4,6),columns = index) result = df.copy() expected = df.copy() result['b'] = [1,2,3,4] expected['b','',''] = [1,2,3,4] assert_frame_equal(result, expected) def test_mixed_depth_drop(self): arrays = [[ 'a', 'top', 'top', 'routine1', 'routine1', 'routine2'], [ '', 'OD', 'OD', 'result1', 'result2', 'result1'], [ '', 'wx', 'wy', '', '', '']] tuples = zip(*arrays) tuples.sort() index = MultiIndex.from_tuples(tuples) df = 
DataFrame(randn(4,6),columns = index) result = df.drop('a',axis=1) expected = df.drop([('a','','')],axis=1) assert_frame_equal(expected, result) result = df.drop(['top'],axis=1) expected = df.drop([('top','OD','wx')], axis=1) expected = expected.drop([('top','OD','wy')], axis=1) assert_frame_equal(expected, result) result = df.drop(('top', 'OD', 'wx'), axis=1) expected = df.drop([('top','OD','wx')], axis=1) assert_frame_equal(expected, result) expected = df.drop([('top','OD','wy')], axis=1) expected = df.drop('top', axis=1) result = df.drop('result1', level=1, axis=1) expected = df.drop([('routine1', 'result1', ''), ('routine2', 'result1', '')], axis=1) assert_frame_equal(expected, result) def test_mixed_depth_pop(self): arrays = [[ 'a', 'top', 'top', 'routine1', 'routine1', 'routine2'], [ '', 'OD', 'OD', 'result1', 'result2', 'result1'], [ '', 'wx', 'wy', '', '', '']] tuples = zip(*arrays) tuples.sort() index = MultiIndex.from_tuples(tuples) df = DataFrame(randn(4,6),columns = index) df1 = df.copy() df2 = df.copy() result = df1.pop('a') expected = df2.pop(('a','','')) assert_series_equal(expected, result) assert_frame_equal(df1, df2) self.assertEquals(result.name,'a') expected = df1['top'] df1 = df1.drop(['top'],axis=1) result = df2.pop('top') assert_frame_equal(expected, result) assert_frame_equal(df1, df2) def test_reindex_level_partial_selection(self): result = self.frame.reindex(['foo', 'qux'], level=0) expected = self.frame.ix[[0, 1, 2, 7, 8, 9]] assert_frame_equal(result, expected) result = self.frame.T.reindex_axis(['foo', 'qux'], axis=1, level=0) assert_frame_equal(result, expected.T) result = self.frame.ix[['foo', 'qux']] assert_frame_equal(result, expected) result = self.frame['A'].ix[['foo', 'qux']] assert_series_equal(result, expected['A']) result = self.frame.T.ix[:, ['foo', 'qux']] assert_frame_equal(result, expected.T) def test_setitem_multiple_partial(self): expected = self.frame.copy() result = self.frame.copy() result.ix[['foo', 'bar']] = 0 
expected.ix['foo'] = 0 expected.ix['bar'] = 0 assert_frame_equal(result, expected) expected = self.frame.copy() result = self.frame.copy() result.ix['foo':'bar'] = 0 expected.ix['foo'] = 0 expected.ix['bar'] = 0 assert_frame_equal(result, expected) expected = self.frame['A'].copy() result = self.frame['A'].copy() result.ix[['foo', 'bar']] = 0 expected.ix['foo'] = 0 expected.ix['bar'] = 0 assert_series_equal(result, expected) expected = self.frame['A'].copy() result = self.frame['A'].copy() result.ix['foo':'bar'] = 0 expected.ix['foo'] = 0 expected.ix['bar'] = 0 assert_series_equal(result, expected) def test_drop_level(self): result = self.frame.drop(['bar', 'qux'], level='first') expected = self.frame.ix[[0, 1, 2, 5, 6]] assert_frame_equal(result, expected) result = self.frame.drop(['two'], level='second') expected = self.frame.ix[[0, 2, 3, 6, 7, 9]] assert_frame_equal(result, expected) result = self.frame.T.drop(['bar', 'qux'], axis=1, level='first') expected = self.frame.ix[[0, 1, 2, 5, 6]].T assert_frame_equal(result, expected) result = self.frame.T.drop(['two'], axis=1, level='second') expected = self.frame.ix[[0, 2, 3, 6, 7, 9]].T assert_frame_equal(result, expected) def test_unicode_repr_issues(self): levels = [Index([u'a/\u03c3', u'b/\u03c3',u'c/\u03c3']), Index([0, 1])] labels = [np.arange(3).repeat(2), np.tile(np.arange(2), 3)] index = MultiIndex(levels=levels, labels=labels) repr(index.levels)
def create_fip(temporary_store = None, year = None):
    # Build the 'fipDat_{year}' table of "fip" individuals (pac declared on tax
    # forms but absent from the erf/eec survey tables) and store it in
    # temporary_store.  Also stores 'pacIndiv_{year}' as a side effect.
    # Reads 'indivim_{year}' from temporary_store and the 'foyer' table from
    # the erfs survey collection.  Python 2 code (list-returning zip, u'' strings).
    assert temporary_store is not None
    assert year is not None
    # fip: personal income tax file ("fichier d'imposition des personnes")
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """
    # Some individuals are declared as 'personne à charge' (pac) on 'tax forms'
    # but are not present in the erf or eec tables.
    # We add them to ensure consistency between concepts.
    year_specific_by_generic = year_specific_by_generic_data_frame_name(year)
    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))
    log.info(u"Démarrage de 03_fip")
    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table = year_specific_by_generic["foyer"], variables = erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace = True)
    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u" 1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    # each pac is encoded on 5 characters (1 letter + 4-digit birth year),
    # so the longest anaisenf string gives the max number of pac per foyer
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))
    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    assert int(nb_pac_max) == nb_pac_max, \
        "nb_pac_max = {} which is not an integer".format(nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [
            (i, 'declaration'),
            (i, 'type_pac'),
            (i, 'naia')
            ]
        multi_index_columns += pac_tuples_list
    columns = MultiIndex.from_tuples(
        multi_index_columns,
        names = ['pac_number', 'variable']
        )
    # placeholder values; every column is overwritten in the loop below
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns = columns)
    # slice the i-th 5-character pac code out of anaisenf
    for i in range(1, nb_pac_max + 1):  # TODO: using values to deal with mismatching indexes
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1: 5 * i].values
    # one row per (foyer, pac_number)
    fip = fip.stack("pac_number")
    fip.reset_index(inplace = True)
    fip.drop(['level_0'], axis = 1, inplace = True)
    log.info(u" 1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy()
    fip = fip.sort(columns = ['declaration', 'naia', 'type_pac'])
    fip.set_index(["declaration", "pac_number"], inplace = True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis = 1, inplace = True)
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), \
        "Certains types de PAC ne sont pas des cases connues"
    # control(fip, debug=True, verbose=True, verbose_columns=['naia'])
    log.info(u" 1.3 : on enlève les individus F pour lesquels il existe un individu G")
    # keep only F & G rows for this de-duplication pass
    type_FG = fip[fip.type_pac.isin(['F', 'G'])].copy()
    type_FG['same_pair'] = type_FG.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_FG['is_twin'] = type_FG.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    # Note: we keep rows with distinct (declar, naia) pairs and the twins,
    # then remove the others (both F and G)
    fip['to_keep'] = np.nan
    fip.update(type_FG)
    log.info(u" 1.4 : on enlève les H pour lesquels il y a un I")
    # same pass for the H/I pair of codes
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_HI['is_twin'] = type_HI.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values
    fip.update(type_HI)
    # rows untouched by either pass (types J, N, R) are kept
    fip['to_keep'] = fip['to_keep'].fillna(True)
    log.info(u"{} F, G, H or I non redundant pac kept over {} potential candidates".format(
        fip['to_keep'].sum(), len(fip))
        )
    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI
    #
    # control(indivifip, debug=True)
    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(), "Il y a des valeurs manquantes de la variable naia"
    # For safety enforce pac.naia and indivifip.naia dtypes
    pac['naia'] = pac.naia.astype('int32')
    indivifip['naia'] = indivifip.naia.astype('int32')
    # match keys: (birth year, first 29 chars of the declaration id);
    # zip returns a list of tuples here (Python 2)
    pac['key1'] = zip(pac.naia, pac['declar1'].str[:29])
    pac['key2'] = zip(pac.naia, pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip.naia.values, indivifip['declaration'].str[:29].values)
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(pac.naia.dtype, indivifip.naia.dtype)
    # true fip = candidates that match no pac already present in the eec
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()
    log.info(u" 2.1 new fip created")
    # We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()
    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    log.info(u"{} pac dans les 1ères déclarations".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    log.info(u"{} pac dans les 2èms déclarations".format(len(pac_ind2)))
    log.info("{} duplicated pac_ind1".format(pac_ind1.duplicated().sum()))
    log.info("{} duplicated pac_ind2".format(pac_ind2.duplicated().sum()))
    del pac_ind1['key1'], pac_ind2['key2']
    # NOTE(review): if both merges are empty, pacInd is never bound and the
    # assert below raises NameError -- confirm this path cannot occur upstream
    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
            log.info(u"Warning : no link between pac and noindiv for both pacInd1&2")
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    assert len(pac_ind1) + len(pac_ind2) == len(pacInd)
    log.info("{} null pac_ind2.type_pac".format(pac_ind2.type_pac.isnull().sum()))
    log.info("pacInd.type_pac.value_counts()) \n {}".format(pacInd.type_pac.value_counts(dropna = False)))
    log.info(u" 2.2 : pacInd created")
    log.info(u"doublons noindiv, type_pac {}".format(pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))
    del pacInd["key"]
    # keep one row per individual
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    # pacIndiv.reset_index(inplace=True)
    log.info("{}".format(pacIndiv.columns))
    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv
    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()
    # We keep the fip in the menage of their parents because it is used in to
    # build the famille. We should build an individual ident (menage) for the fip that are
    # older than 18 since they are not in their parents' menage according to the eec
    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype('int32')  # To be used later to set idfoy
    # join fip rows to the declaring individual ("vous") via declar1 ...
    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi.persfip == "vous")]
    individec1 = individec1[["declar1", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec1 = individec1.rename(columns = {'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on = 'declaration')
    log.info(u" 2.3 : fip1 created")
    # ... and via declar2
    individec2 = indivi.loc[
        (indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous"),
        ["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]
        ].copy()
    individec2.rename(columns = {'declar2': 'declaration'}, inplace = True)
    fip2 = fip.merge(individec2)
    log.info(u" 2.4 : fip2 created")
    # NOTE(review): the two value_counts() results below are discarded --
    # presumably leftover debugging
    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()
    fip = concat([fip1, fip2])
    # fill in the eec-style individual variables for the synthetic fip rows
    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float')  # BUG: no year column in the DataFrame
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration'].copy()
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration'].copy()
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    # lpr: 3 if age <= 20, else 4
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    # actrec: 9 if age <= 15, else 5
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5
    # TODO: actrec problem for fip children between 16 and 20: we cannot tell whether they are students or employed */
    # TODO: problem with the birth months of FIP children: see whether those values can be recovered. Alexis: clearly not
    # Reassigning noi for fip children if they are more than one per foyer fiscal
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")
    # NOTE(review): this first fip_tmp assignment is immediately shadowed
    # inside the loop and otherwise unused
    fip_tmp = fip[['noi', 'ident']]
    # decrement noi of duplicated (noi, ident) rows until all pairs are unique
    while any(fip.duplicated(subset = ['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        log.info("{}".format(len(tmp)))
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1
    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0
    log.info("Number of duplicated fip: {}".format(fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    del fip, fip1, individec1, indivifip, indivi, pac
    log.info(u"fip sauvegardé")
class TestMultiLevel(unittest.TestCase):
    """Legacy unittest suite for pandas MultiIndex (hierarchical index) behavior.

    Covers indexing (``[]``, ``.ix``, ``xs``), reshaping (``stack``/``unstack``),
    sorting (``sortlevel``), level-wise reductions/groupby, and alignment on
    MultiIndexed Series/DataFrames.

    NOTE(review): written against a pre-1.0, Python-2-era pandas API — it uses
    ``.ix``, ``sortlevel``, ``Panel``, ``cPickle``, ``nose.SkipTest``,
    ``self.assert_``/``assertEquals`` and list-returning ``zip`` — none of which
    exist in modern pandas/Python 3. Keep the exact calls when editing.
    """

    def setUp(self):
        # self.frame: 10x3 random frame with a 2-level ("first", "second")
        # row MultiIndex and a named ("exp") column Index.
        index = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        self.frame = DataFrame(np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp"))

        # A one-level MultiIndex (degenerate case).
        self.single_level = MultiIndex(levels=[["foo", "bar", "baz", "qux"]], labels=[[0, 1, 2, 3]], names=["first"])

        # create test series object
        arrays = [
            ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
        tuples = zip(*arrays)
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)
        s[3] = np.NaN  # one missing value for skipna paths
        self.series = s

        # self.ymd: time frame aggregated to a 3-level (year, month, day) index.
        tm.N = 100
        self.tdf = tm.makeTimeDataFrame()
        self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum()

        # use Int64Index, to make sure things work
        self.ymd.index.levels = [lev.astype("i8") for lev in self.ymd.index.levels]
        self.ymd.index.names = ["year", "month", "day"]

    def test_append(self):
        # Appending the two halves of a MultiIndexed frame round-trips it.
        a, b = self.frame[:5], self.frame[5:]
        result = a.append(b)
        tm.assert_frame_equal(result, self.frame)
        result = a["A"].append(b["A"])
        tm.assert_series_equal(result, self.frame["A"])

    def test_reindex_level(self):
        # Broadcasting a level-aggregated frame back onto the full index
        # via reindex(..., level=...) matches groupby().transform().
        # axis=0
        month_sums = self.ymd.sum(level="month")
        result = month_sums.reindex(self.ymd.index, level=1)
        expected = self.ymd.groupby(level="month").transform(np.sum)
        assert_frame_equal(result, expected)
        # Series
        result = month_sums["A"].reindex(self.ymd.index, level=1)
        expected = self.ymd["A"].groupby(level="month").transform(np.sum)
        assert_series_equal(result, expected)
        # axis=1
        month_sums = self.ymd.T.sum(axis=1, level="month")
        result = month_sums.reindex(columns=self.ymd.index, level=1)
        expected = self.ymd.groupby(level="month").transform(np.sum).T
        assert_frame_equal(result, expected)

    def test_binops_level(self):
        # Arithmetic with level= broadcasts the level-aggregated operand,
        # equivalent to operating against a groupby().transform() result.
        def _check_op(opname):
            op = getattr(DataFrame, opname)
            month_sums = self.ymd.sum(level="month")
            result = op(self.ymd, month_sums, level="month")
            broadcasted = self.ymd.groupby(level="month").transform(np.sum)
            expected = op(self.ymd, broadcasted)
            assert_frame_equal(result, expected)
            # Series
            op = getattr(Series, opname)
            result = op(self.ymd["A"], month_sums["A"], level="month")
            broadcasted = self.ymd["A"].groupby(level="month").transform(np.sum)
            expected = op(self.ymd["A"], broadcasted)
            assert_series_equal(result, expected)

        _check_op("sub")
        _check_op("add")
        _check_op("mul")
        _check_op("div")

    def test_pickle(self):
        # Python-2 pickle round-trip preserves MultiIndexed objects.
        import cPickle

        def _test_roundtrip(frame):
            pickled = cPickle.dumps(frame)
            unpickled = cPickle.loads(pickled)
            assert_frame_equal(frame, unpickled)

        _test_roundtrip(self.frame)
        _test_roundtrip(self.frame.T)
        _test_roundtrip(self.ymd)
        _test_roundtrip(self.ymd.T)

    def test_reindex(self):
        # .ix with a list of full index tuples behaves like positional selection.
        reindexed = self.frame.ix[[("foo", "one"), ("bar", "one")]]
        expected = self.frame.ix[[0, 3]]
        assert_frame_equal(reindexed, expected)

    def test_reindex_preserve_levels(self):
        # Reindexing with an existing MultiIndex must reuse that exact object
        # (identity, not just equality).
        new_index = self.ymd.index[::10]
        chunk = self.ymd.reindex(new_index)
        self.assert_(chunk.index is new_index)
        chunk = self.ymd.ix[new_index]
        self.assert_(chunk.index is new_index)
        ymdT = self.ymd.T
        chunk = ymdT.reindex(columns=new_index)
        self.assert_(chunk.columns is new_index)
        chunk = ymdT.ix[:, new_index]
        self.assert_(chunk.columns is new_index)

    def test_sort_index_preserve_levels(self):
        result = self.frame.sort_index()
        self.assertEquals(result.index.names, self.frame.index.names)

    def test_repr_to_string(self):
        # Smoke test: repr/to_string must not raise on MultiIndexed objects.
        repr(self.frame)
        repr(self.ymd)
        repr(self.frame.T)
        repr(self.ymd.T)
        buf = StringIO()
        self.frame.to_string(buf=buf)
        self.ymd.to_string(buf=buf)
        self.frame.T.to_string(buf=buf)
        self.ymd.T.to_string(buf=buf)

    def test_getitem_simple(self):
        df = self.frame.T
        col = df["foo", "one"]
        assert_almost_equal(col.values, df.values[:, 0])
        # Missing tuple or scalar keys raise KeyError.
        self.assertRaises(KeyError, df.__getitem__, ("foo", "four"))
        self.assertRaises(KeyError, df.__getitem__, "foobar")

    def test_series_getitem(self):
        s = self.ymd["A"]
        # Partial tuple indexing drops the matched levels.
        result = s[2000, 3]
        result2 = s.ix[2000, 3]
        expected = s.reindex(s.index[42:65])
        expected.index = expected.index.droplevel(0).droplevel(0)
        assert_series_equal(result, expected)
        # Full tuple indexing returns a scalar.
        result = s[2000, 3, 10]
        expected = s[49]
        self.assertEquals(result, expected)
        # fancy
        result = s.ix[[(2000, 3, 10), (2000, 3, 13)]]
        expected = s.reindex(s.index[49:51])
        assert_series_equal(result, expected)
        # key error
        self.assertRaises(KeyError, s.__getitem__, (2000, 3, 4))

    def test_series_getitem_corner(self):
        s = self.ymd["A"]
        # don't segfault, GH #495
        # out of bounds access
        self.assertRaises(IndexError, s.__getitem__, len(self.ymd))
        # generator
        result = s[(x > 0 for x in s)]
        expected = s[s > 0]
        assert_series_equal(result, expected)

    def test_series_setitem(self):
        s = self.ymd["A"]
        # Partial-tuple assignment hits the whole matched slice.
        s[2000, 3] = np.nan
        self.assert_(isnull(s.values[42:65]).all())
        self.assert_(notnull(s.values[:42]).all())
        self.assert_(notnull(s.values[65:]).all())
        s[2000, 3, 10] = np.nan
        self.assert_(isnull(s[49]))

    def test_series_slice_partial(self):
        pass

    def test_frame_getitem_setitem_slice(self):
        # getitem
        result = self.frame.ix[:4]
        expected = self.frame[:4]
        assert_frame_equal(result, expected)
        # setitem
        cp = self.frame.copy()
        cp.ix[:4] = 0
        self.assert_((cp.values[:4] == 0).all())
        self.assert_((cp.values[4:] != 0).all())

    def test_frame_getitem_setitem_multislice(self):
        levels = [["t1", "t2"], ["a", "b", "c"]]
        labels = [[0, 0, 0, 1, 1], [0, 1, 2, 0, 1]]
        midx = MultiIndex(labels=labels, levels=levels, names=[None, "id"])
        df = DataFrame({"value": [1, 2, 3, 7, 8]}, index=midx)
        result = df.ix[:, "value"]
        assert_series_equal(df["value"], result)
        result = df.ix[1:3, "value"]
        assert_series_equal(df["value"][1:3], result)
        result = df.ix[:, :]
        assert_frame_equal(df, result)
        # .ix assignment writes through to the (aliased) frame.
        result = df
        df.ix[:, "value"] = 10
        result["value"] = 10
        assert_frame_equal(df, result)
        df.ix[:, :] = 10
        assert_frame_equal(df, result)

    def test_getitem_tuple_plus_slice(self):
        # GH #671
        df = DataFrame({"a": range(10), "b": range(10), "c": np.random.randn(10), "d": np.random.randn(10)})
        idf = df.set_index(["a", "b"])
        result = idf.ix[(0, 0), :]
        expected = idf.ix[0, 0]
        expected2 = idf.xs((0, 0))
        assert_series_equal(result, expected)
        assert_series_equal(result, expected2)

    def test_xs(self):
        xs = self.frame.xs(("bar", "two"))
        xs2 = self.frame.ix[("bar", "two")]
        assert_series_equal(xs, xs2)
        assert_almost_equal(xs.values, self.frame.values[4])

    def test_xs_partial(self):
        result = self.frame.xs("foo")
        result2 = self.frame.ix["foo"]
        expected = self.frame.T["foo"].T
        assert_frame_equal(result, expected)
        assert_frame_equal(result, result2)

    def test_xs_level(self):
        # xs on a non-leading level drops that level from the result index.
        result = self.frame.xs("two", level="second")
        expected = self.frame[self.frame.index.get_level_values(1) == "two"]
        expected.index = expected.index.droplevel(1)
        assert_frame_equal(result, expected)
        index = MultiIndex.from_tuples([("x", "y", "z"), ("a", "b", "c"), ("p", "q", "r")])
        df = DataFrame(np.random.randn(3, 5), index=index)
        result = df.xs("c", level=2)
        expected = df[1:2]
        expected.index = expected.index.droplevel(2)
        assert_frame_equal(result, expected)

    def test_xs_level_multiple(self):
        from pandas import read_table
        from StringIO import StringIO

        # NOTE(review): fixed-width table below was reconstructed from a
        # whitespace-mangled source; it is parsed with sep="\s+", so exact
        # column spacing is not behavior-significant.
        text = """                      A       B       C       D        E
one two three   four
a   b   10.0032 5    -0.5109 -2.3358 -0.4645  0.05076  0.3640
a   q   20      4     0.4473  1.4152  0.2834  1.00661  0.1744
x   q   30      3    -0.6662 -0.5243 -0.3580  0.89145  2.5838"""
        df = read_table(StringIO(text), sep="\s+")
        # Cross-section on two named levels at once.
        result = df.xs(("a", 4), level=["one", "four"])
        expected = df.xs("a").xs(4, level="four")
        assert_frame_equal(result, expected)

    def test_xs_level0(self):
        from pandas import read_table
        from StringIO import StringIO

        # NOTE(review): same reconstructed table as above — sep="\s+" parsing.
        text = """                      A       B       C       D        E
one two three   four
a   b   10.0032 5    -0.5109 -2.3358 -0.4645  0.05076  0.3640
a   q   20      4     0.4473  1.4152  0.2834  1.00661  0.1744
x   q   30      3    -0.6662 -0.5243 -0.3580  0.89145  2.5838"""
        df = read_table(StringIO(text), sep="\s+")
        # xs(level=0) is equivalent to a plain xs on the first level.
        result = df.xs("a", level=0)
        expected = df.xs("a")
        self.assertEqual(len(result), 2)
        assert_frame_equal(result, expected)

    def test_xs_level_series(self):
        s = self.frame["A"]
        result = s[:, "two"]
        expected = self.frame.xs("two", level=1)["A"]
        assert_series_equal(result, expected)
        s = self.ymd["A"]
        result = s[2000, 5]
        expected = self.ymd.ix[2000, 5]["A"]
        assert_series_equal(result, expected)
        # not implementing this for now
        self.assertRaises(TypeError, s.__getitem__, (2000, slice(3, 4)))
        # result = s[2000, 3:4]
        # lv =s.index.get_level_values(1)
        # expected = s[(lv == 3) | (lv == 4)]
        # expected.index = expected.index.droplevel(0)
        # assert_series_equal(result, expected)
        # can do this though

    def test_get_loc_single_level(self):
        # Lookup must work on a one-level MultiIndex.
        s = Series(np.random.randn(len(self.single_level)), index=self.single_level)
        for k in self.single_level.values:
            s[k]

    def test_getitem_toplevel(self):
        df = self.frame.T
        result = df["foo"]
        expected = df.reindex(columns=df.columns[:3])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)
        result = df["bar"]
        result2 = df.ix[:, "bar"]
        expected = df.reindex(columns=df.columns[3:5])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result, result2)

    def test_getitem_setitem_slice_integers(self):
        # Integer .ix slices on an integer-leveled MultiIndex are label-based.
        index = MultiIndex(levels=[[0, 1, 2], [0, 2]], labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])
        frame = DataFrame(np.random.randn(len(index), 4), index=index, columns=["a", "b", "c", "d"])
        res = frame.ix[1:2]
        exp = frame.reindex(frame.index[2:])
        assert_frame_equal(res, exp)
        frame.ix[1:2] = 7
        self.assert_((frame.ix[1:2] == 7).values.all())
        series = Series(np.random.randn(len(index)), index=index)
        res = series.ix[1:2]
        exp = series.reindex(series.index[2:])
        assert_series_equal(res, exp)
        series.ix[1:2] = 7
        self.assert_((series.ix[1:2] == 7).values.all())

    def test_getitem_int(self):
        levels = [[0, 1], [0, 1, 2]]
        labels = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]]
        index = MultiIndex(levels=levels, labels=labels)
        frame = DataFrame(np.random.randn(6, 2), index=index)
        result = frame.ix[1]
        expected = frame[-3:]
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)
        # raises exception
        self.assertRaises(KeyError, frame.ix.__getitem__, 3)
        # however this will work
        result = self.frame.ix[2]
        expected = self.frame.xs(self.frame.index[2])
        assert_series_equal(result, expected)

    def test_getitem_partial(self):
        ymd = self.ymd.T
        result = ymd[2000, 2]
        expected = ymd.reindex(columns=ymd.columns[ymd.columns.labels[1] == 1])
        expected.columns = expected.columns.droplevel(0).droplevel(0)
        assert_frame_equal(result, expected)

    def test_getitem_slice_not_sorted(self):
        df = self.frame.sortlevel(1).T
        # buglet with int typechecking
        result = df.ix[:, : np.int32(3)]
        expected = df.reindex(columns=df.columns[:3])
        assert_frame_equal(result, expected)

    def test_setitem_change_dtype(self):
        # Assigning a bool column into a float frame must split blocks and
        # keep the MultiIndex on the new block's items.
        dft = self.frame.T
        s = dft["foo", "two"]
        dft["foo", "two"] = s > s.median()
        assert_series_equal(dft["foo", "two"], s > s.median())
        self.assert_(isinstance(dft._data.blocks[1].items, MultiIndex))
        reindexed = dft.reindex(columns=[("foo", "two")])
        assert_series_equal(reindexed["foo", "two"], s > s.median())

    def test_frame_setitem_ix(self):
        self.frame.ix[("bar", "two"), "B"] = 5
        self.assertEquals(self.frame.ix[("bar", "two"), "B"], 5)
        # with integer labels
        df = self.frame.copy()
        df.columns = range(3)
        df.ix[("bar", "two"), 1] = 7
        self.assertEquals(df.ix[("bar", "two"), 1], 7)

    def test_fancy_slice_partial(self):
        # Label slices on the first level / partial tuples are inclusive.
        result = self.frame.ix["bar":"baz"]
        expected = self.frame[3:7]
        assert_frame_equal(result, expected)
        result = self.ymd.ix[(2000, 2):(2000, 4)]
        lev = self.ymd.index.labels[1]
        expected = self.ymd[(lev >= 1) & (lev <= 3)]
        assert_frame_equal(result, expected)

    def test_sortlevel(self):
        # sortlevel on a non-MultiIndex axis must raise.
        df = self.frame.copy()
        df.index = np.arange(len(df))
        self.assertRaises(Exception, df.sortlevel, 0)
        # axis=1
        # series
        a_sorted = self.frame["A"].sortlevel(0)
        self.assertRaises(Exception, self.frame.reset_index()["A"].sortlevel)
        # preserve names
        self.assertEquals(a_sorted.index.names, self.frame.index.names)

    def test_delevel_infer_dtype(self):
        # reset_index should infer int/float dtypes for the former levels.
        tuples = [tuple for tuple in cart_product(["foo", "bar"], [10, 20], [1.0, 1.1])]
        index = MultiIndex.from_tuples(tuples, names=["prm0", "prm1", "prm2"])
        df = DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"], index=index)
        deleveled = df.reset_index()
        self.assert_(com.is_integer_dtype(deleveled["prm1"]))
        self.assert_(com.is_float_dtype(deleveled["prm2"]))

    def test_reset_index_with_drop(self):
        deleveled = self.ymd.reset_index(drop=True)
        self.assertEquals(len(deleveled.columns), len(self.ymd.columns))
        deleveled = self.series.reset_index()
        self.assert_(isinstance(deleveled, DataFrame))
        self.assert_(len(deleveled.columns) == len(self.series.index.levels) + 1)
        deleveled = self.series.reset_index(drop=True)
        self.assert_(isinstance(deleveled, Series))

    def test_sortlevel_by_name(self):
        self.frame.index.names = ["first", "second"]
        result = self.frame.sortlevel(level="second")
        expected = self.frame.sortlevel(level=1)
        assert_frame_equal(result, expected)

    def test_sortlevel_mixed(self):
        # sortlevel must be stable under a mixed-dtype (object) column.
        sorted_before = self.frame.sortlevel(1)
        df = self.frame.copy()
        df["foo"] = "bar"
        sorted_after = df.sortlevel(1)
        assert_frame_equal(sorted_before, sorted_after.drop(["foo"], axis=1))
        dft = self.frame.T
        sorted_before = dft.sortlevel(1, axis=1)
        dft["foo", "three"] = "bar"
        sorted_after = dft.sortlevel(1, axis=1)
        assert_frame_equal(
            sorted_before.drop([("foo", "three")], axis=1), sorted_after.drop([("foo", "three")], axis=1)
        )

    def test_count_level(self):
        def _check_counts(frame, axis=0):
            # count(level=i) agrees with groupby(level=i).count() per level.
            index = frame._get_axis(axis)
            for i in range(index.nlevels):
                result = frame.count(axis=axis, level=i)
                expected = frame.groupby(axis=axis, level=i).count(axis=axis)
                expected = expected.reindex_like(result).astype("i8")
                assert_frame_equal(result, expected)

        # Inject NaNs so counts differ per cell.
        self.frame.ix[1, [1, 2]] = np.nan
        self.frame.ix[7, [0, 1]] = np.nan
        self.ymd.ix[1, [1, 2]] = np.nan
        self.ymd.ix[7, [0, 1]] = np.nan
        _check_counts(self.frame)
        _check_counts(self.ymd)
        _check_counts(self.frame.T, axis=1)
        _check_counts(self.ymd.T, axis=1)
        # can't call with level on regular DataFrame
        df = tm.makeTimeDataFrame()
        self.assertRaises(Exception, df.count, level=0)
        self.frame["D"] = "foo"
        result = self.frame.count(level=0, numeric_only=True)
        assert_almost_equal(result.columns, ["A", "B", "C"])

    def test_count_level_series(self):
        index = MultiIndex(
            levels=[["foo", "bar", "baz"], ["one", "two", "three", "four"]],
            labels=[[0, 0, 0, 2, 2], [2, 0, 1, 1, 2]]
        )
        s = Series(np.random.randn(len(index)), index=index)
        result = s.count(level=0)
        expected = s.groupby(level=0).count()
        assert_series_equal(result.astype("f8"), expected.reindex(result.index).fillna(0))
        result = s.count(level=1)
        expected = s.groupby(level=1).count()
        assert_series_equal(result.astype("f8"), expected.reindex(result.index).fillna(0))

    def test_count_level_corner(self):
        # Empty slices should produce all-zero counts, not raise.
        s = self.frame["A"][:0]
        result = s.count(level=0)
        expected = Series(0, index=s.index.levels[0])
        assert_series_equal(result, expected)
        df = self.frame[:0]
        result = df.count(level=0)
        expected = DataFrame({}, index=s.index.levels[0], columns=df.columns).fillna(0).astype(int)
        assert_frame_equal(result, expected)

    def test_unstack(self):
        # just check that it works for now
        unstacked = self.ymd.unstack()
        unstacked2 = unstacked.unstack()
        # test that ints work
        unstacked = self.ymd.astype(int).unstack()

    def test_stack(self):
        # regular roundtrip
        unstacked = self.ymd.unstack()
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)
        # Round-trips from non-lexsorted starting points.
        unlexsorted = self.ymd.sortlevel(2)
        unstacked = unlexsorted.unstack(2)
        restacked = unstacked.stack()
        assert_frame_equal(restacked.sortlevel(0), self.ymd)
        unlexsorted = unlexsorted[::-1]
        unstacked = unlexsorted.unstack(1)
        restacked = unstacked.stack().swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)
        unlexsorted = unlexsorted.swaplevel(0, 1)
        unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1)
        restacked = unstacked.stack(0).swaplevel(1, 2)
        assert_frame_equal(restacked.sortlevel(0), self.ymd)
        # columns unsorted
        unstacked = self.ymd.unstack()
        unstacked = unstacked.sort(axis=1, ascending=False)
        restacked = unstacked.stack()
        assert_frame_equal(restacked, self.ymd)
        # more than 2 levels in the columns
        unstacked = self.ymd.unstack(1).unstack(1)
        result = unstacked.stack(1)
        expected = self.ymd.unstack()
        assert_frame_equal(result, expected)
        result = unstacked.stack(2)
        expected = self.ymd.unstack(1)
        assert_frame_equal(result, expected)
        result = unstacked.stack(0)
        expected = self.ymd.stack().unstack(1).unstack(1)
        assert_frame_equal(result, expected)
        # not all levels present in each echelon
        unstacked = self.ymd.unstack(2).ix[:, ::3]
        stacked = unstacked.stack().stack()
        ymd_stacked = self.ymd.stack()
        assert_series_equal(stacked, ymd_stacked.reindex(stacked.index))
        # stack with negative number
        result = self.ymd.unstack(0).stack(-2)
        expected = self.ymd.unstack(0).stack(0)

    def test_stack_mixed_dtype(self):
        df = self.frame.T
        df["foo", "four"] = "foo"
        df = df.sortlevel(1, axis=1)
        stacked = df.stack()
        assert_series_equal(stacked["foo"], df["foo"].stack())
        # Numeric-only sub-frame keeps its float dtype after stacking.
        self.assert_(stacked["bar"].dtype == np.float_)

    def test_unstack_bug(self):
        df = DataFrame(
            {
                "state": ["naive", "naive", "naive", "activ", "activ", "activ"],
                "exp": ["a", "b", "b", "b", "a", "a"],
                "barcode": [1, 2, 3, 4, 1, 3],
                "v": ["hi", "hi", "bye", "bye", "bye", "peace"],
                "extra": np.arange(6.0),
            }
        )
        result = df.groupby(["state", "exp", "barcode", "v"]).apply(len)
        unstacked = result.unstack()
        restacked = unstacked.stack()
        assert_series_equal(restacked, result.reindex(restacked.index).astype(float))

    def test_stack_unstack_preserve_names(self):
        unstacked = self.frame.unstack()
        self.assertEquals(unstacked.index.name, "first")
        self.assertEquals(unstacked.columns.names, ["exp", "second"])
        restacked = unstacked.stack()
        self.assertEquals(restacked.index.names, self.frame.index.names)

    def test_unstack_level_name(self):
        result = self.frame.unstack("second")
        expected = self.frame.unstack(level=1)
        assert_frame_equal(result, expected)

    def test_stack_level_name(self):
        unstacked = self.frame.unstack("second")
        result = unstacked.stack("exp")
        expected = self.frame.unstack().stack(0)
        assert_frame_equal(result, expected)
        result = self.frame.stack("exp")
        expected = self.frame.stack()
        assert_series_equal(result, expected)

    def test_stack_unstack_multiple(self):
        # Stacking/unstacking a list of levels at once.
        unstacked = self.ymd.unstack(["year", "month"])
        expected = self.ymd.unstack("year").unstack("month")
        assert_frame_equal(unstacked, expected)
        self.assertEquals(unstacked.columns.names, expected.columns.names)
        # series
        s = self.ymd["A"]
        s_unstacked = s.unstack(["year", "month"])
        assert_frame_equal(s_unstacked, expected["A"])
        restacked = unstacked.stack(["year", "month"])
        restacked = restacked.swaplevel(0, 1).swaplevel(1, 2)
        restacked = restacked.sortlevel(0)
        assert_frame_equal(restacked, self.ymd)
        self.assertEquals(restacked.index.names, self.ymd.index.names)
        # GH #451
        unstacked = self.ymd.unstack([1, 2])
        expected = self.ymd.unstack(1).unstack(1)
        assert_frame_equal(unstacked, expected)
        unstacked = self.ymd.unstack([2, 1])
        expected = self.ymd.unstack(2).unstack(1)
        assert_frame_equal(unstacked, expected)

    def test_groupby_transform(self):
        s = self.frame["A"]
        grouper = s.index.get_level_values(0)
        grouped = s.groupby(grouper)
        applied = grouped.apply(lambda x: x * 2)
        expected = grouped.transform(lambda x: x * 2)
        assert_series_equal(applied.reindex(expected.index), expected)

    def test_groupby_corner(self):
        midx = MultiIndex(levels=[["foo"], ["bar"], ["baz"]], labels=[[0], [0], [0]], names=["one", "two", "three"])
        df = DataFrame([np.random.rand(4)], columns=["a", "b", "c", "d"], index=midx)
        # should work
        df.groupby(level="three")

    def test_join(self):
        a = self.frame.ix[:5, ["A"]]
        b = self.frame.ix[2:, ["B", "C"]]
        joined = a.join(b, how="outer").reindex(self.frame.index)
        expected = self.frame.copy()
        expected.values[np.isnan(joined.values)] = np.nan
        self.assert_(not np.isnan(joined.values).all())
        assert_frame_equal(joined, expected)

    def test_swaplevel(self):
        # Numeric and name-based swaplevel must agree and round-trip.
        swapped = self.frame["A"].swaplevel(0, 1)
        swapped2 = self.frame["A"].swaplevel("first", "second")
        self.assert_(not swapped.index.equals(self.frame.index))
        assert_series_equal(swapped, swapped2)
        back = swapped.swaplevel(0, 1)
        back2 = swapped.swaplevel("second", "first")
        self.assert_(back.index.equals(self.frame.index))
        assert_series_equal(back, back2)
        ft = self.frame.T
        swapped = ft.swaplevel("first", "second", axis=1)
        exp = self.frame.swaplevel("first", "second").T
        assert_frame_equal(swapped, exp)

    def test_swaplevel_panel(self):
        panel = Panel({"ItemA": self.frame, "ItemB": self.frame * 2})
        result = panel.swaplevel(0, 1, axis="major")
        expected = panel.copy()
        expected.major_axis = expected.major_axis.swaplevel(0, 1)
        tm.assert_panel_equal(result, expected)

    def test_reorder_levels(self):
        # reorder_levels by name is equivalent to chained swaplevels.
        result = self.ymd.reorder_levels(["month", "day", "year"])
        expected = self.ymd.swaplevel(0, 1).swaplevel(1, 2)
        assert_frame_equal(result, expected)
        result = self.ymd["A"].reorder_levels(["month", "day", "year"])
        expected = self.ymd["A"].swaplevel(0, 1).swaplevel(1, 2)
        assert_series_equal(result, expected)
        result = self.ymd.T.reorder_levels(["month", "day", "year"], axis=1)
        expected = self.ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1)
        assert_frame_equal(result, expected)
        self.assertRaises(Exception, self.ymd.index.reorder_levels, [1, 2, 3])

    def test_insert_index(self):
        df = self.ymd[:5].T
        df[2000, 1, 10] = df[2000, 1, 7]
        self.assert_(isinstance(df.columns, MultiIndex))
        self.assert_((df[2000, 1, 10] == df[2000, 1, 7]).all())

    def test_alignment(self):
        x = Series(data=[1, 2, 3], index=MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3)]))
        y = Series(data=[4, 5, 6], index=MultiIndex.from_tuples([("Z", 1), ("Z", 2), ("B", 3)]))
        res = x - y
        exp_index = x.index.union(y.index)
        exp = x.reindex(exp_index) - y.reindex(exp_index)
        assert_series_equal(res, exp)
        # hit non-monotonic code path
        res = x[::-1] - y[::-1]
        exp_index = x.index.union(y.index)
        exp = x.reindex(exp_index) - y.reindex(exp_index)
        assert_series_equal(res, exp)

    def test_is_lexsorted(self):
        levels = [[0, 1], [0, 1, 2]]
        index = MultiIndex(levels=levels, labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
        self.assert_(index.is_lexsorted())
        index = MultiIndex(levels=levels, labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]])
        self.assert_(not index.is_lexsorted())
        index = MultiIndex(levels=levels, labels=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]])
        self.assert_(not index.is_lexsorted())
        self.assert_(index.lexsort_depth == 0)

    def test_frame_getitem_view(self):
        # Single-dtype frame: column view writes through to the frame.
        df = self.frame.T
        df["foo"].values[:] = 0
        self.assert_((df["foo"].values == 0).all())
        # but not if it's mixed-type
        df["foo", "four"] = "foo"
        df = df.sortlevel(0, axis=1)
        df["foo"]["one"] = 2
        self.assert_((df["foo", "one"] == 0).all())

    def test_frame_getitem_not_sorted(self):
        df = self.frame.T
        df["foo", "four"] = "foo"
        arrays = [np.array(x) for x in zip(*df.columns.get_tuple_index())]
        result = df["foo"]
        result2 = df.ix[:, "foo"]
        expected = df.reindex(columns=df.columns[arrays[0] == "foo"])
        expected.columns = expected.columns.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)
        df = df.T
        result = df.xs("foo")
        result2 = df.ix["foo"]
        expected = df.reindex(df.index[arrays[0] == "foo"])
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

    def test_series_getitem_not_sorted(self):
        arrays = [
            ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"],
            ["one", "two", "one", "two", "one", "two", "one", "two"],
        ]
        tuples = zip(*arrays)
        index = MultiIndex.from_tuples(tuples)
        s = Series(randn(8), index=index)
        arrays = [np.array(x) for x in zip(*index.get_tuple_index())]
        result = s["qux"]
        result2 = s.ix["qux"]
        expected = s[arrays[0] == "qux"]
        expected.index = expected.index.droplevel(0)
        assert_series_equal(result, expected)
        assert_series_equal(result2, expected)

    def test_count(self):
        # count(level=name) matches count(level=position) for both
        # DataFrame and Series; unknown level names raise.
        frame = self.frame.copy()
        frame.index.names = ["a", "b"]
        result = frame.count(level="b")
        expect = self.frame.count(level=1)
        assert_frame_equal(result, expect)
        result = frame.count(level="a")
        expect = self.frame.count(level=0)
        assert_frame_equal(result, expect)
        series = self.series.copy()
        series.index.names = ["a", "b"]
        result = series.count(level="b")
        expect = self.series.count(level=1)
        assert_series_equal(result, expect)
        result = series.count(level="a")
        expect = self.series.count(level=0)
        assert_series_equal(result, expect)
        self.assertRaises(Exception, series.count, "x")
        self.assertRaises(Exception, frame.count, level="x")

    # Reductions exercised by the grouped-aggregation tests below.
    AGG_FUNCTIONS = ["sum", "prod", "min", "max", "median", "mean", "skew", "mad", "std", "var"]

    def test_series_group_min_max(self):
        # Series.<op>(level=..., skipna=...) agrees with groupby().agg().
        for op, level, skipna in cart_product(self.AGG_FUNCTIONS, range(2), [False, True]):
            grouped = self.series.groupby(level=level)
            aggf = lambda x: getattr(x, op)(skipna=skipna)
            # skipna=True
            leftside = grouped.agg(aggf)
            rightside = getattr(self.series, op)(level=level, skipna=skipna)
            assert_series_equal(leftside, rightside)

    def test_frame_group_ops(self):
        # Frame.<op>(level=..., axis=..., skipna=...) agrees with
        # groupby(level=..., axis=...).agg() across ops/levels/axes.
        self.frame.ix[1, [1, 2]] = np.nan
        self.frame.ix[7, [0, 1]] = np.nan
        for op, level, axis, skipna in cart_product(self.AGG_FUNCTIONS, range(2), range(2), [False, True]):
            if axis == 0:
                frame = self.frame
            else:
                frame = self.frame.T
            grouped = frame.groupby(level=level, axis=axis)
            aggf = lambda x: getattr(x, op)(skipna=skipna, axis=axis)
            leftside = grouped.agg(aggf)
            rightside = getattr(frame, op)(level=level, axis=axis, skipna=skipna)
            # for good measure, groupby detail
            level_index = frame._get_axis(axis).levels[level]
            self.assert_(leftside._get_axis(axis).equals(level_index))
            self.assert_(rightside._get_axis(axis).equals(level_index))
            assert_frame_equal(leftside, rightside)

    def test_frame_series_agg_multiple_levels(self):
        result = self.ymd.sum(level=["year", "month"])
        expected = self.ymd.groupby(level=["year", "month"]).sum()
        assert_frame_equal(result, expected)
        result = self.ymd["A"].sum(level=["year", "month"])
        expected = self.ymd["A"].groupby(level=["year", "month"]).sum()
        assert_series_equal(result, expected)

    def test_groupby_multilevel(self):
        result = self.ymd.groupby(level=[0, 1]).mean()
        k1 = self.ymd.index.get_level_values(0)
        k2 = self.ymd.index.get_level_values(1)
        expected = self.ymd.groupby([k1, k2]).mean()
        assert_frame_equal(result, expected)
        self.assertEquals(result.index.names, self.ymd.index.names[:2])
        result2 = self.ymd.groupby(level=self.ymd.index.names[:2]).mean()
        assert_frame_equal(result, result2)

    def test_groupby_multilevel_with_transform(self):
        pass

    def test_multilevel_consolidate(self):
        index = MultiIndex.from_tuples([("foo", "one"), ("foo", "two"), ("bar", "one"), ("bar", "two")])
        df = DataFrame(np.random.randn(4, 4), index=index, columns=index)
        df["Totals", ""] = df.sum(1)
        df = df.consolidate()

    def test_ix_preserve_names(self):
        result = self.ymd.ix[2000]
        result2 = self.ymd["A"].ix[2000]
        self.assertEquals(result.index.names, self.ymd.index.names[1:])
        self.assertEquals(result2.index.names, self.ymd.index.names[1:])
        result = self.ymd.ix[2000, 2]
        result2 = self.ymd["A"].ix[2000, 2]
        self.assertEquals(result.index.name, self.ymd.index.names[2])
        self.assertEquals(result2.index.name, self.ymd.index.names[2])

    def test_partial_set(self):
        # GH #397
        df = self.ymd.copy()
        exp = self.ymd.copy()
        df.ix[2000, 4] = 0
        exp.ix[2000, 4].values[:] = 0
        assert_frame_equal(df, exp)
        df["A"].ix[2000, 4] = 1
        exp["A"].ix[2000, 4].values[:] = 1
        assert_frame_equal(df, exp)
        df.ix[2000] = 5
        exp.ix[2000].values[:] = 5
        assert_frame_equal(df, exp)
        # this works...for now
        df["A"].ix[14] = 5
        self.assertEquals(df["A"][14], 5)

    def test_unstack_preserve_types(self):
        # GH #403
        self.ymd["E"] = "foo"
        self.ymd["F"] = 2
        unstacked = self.ymd.unstack("month")
        self.assert_(unstacked["A", 1].dtype == np.float64)
        self.assert_(unstacked["E", 1].dtype == np.object_)
        self.assert_(unstacked["F", 1].dtype == np.float64)

    def test_getitem_lowerdim_corner(self):
        self.assertRaises(KeyError, self.frame.ix.__getitem__, (("bar", "three"), "B"))
        self.assertRaises(KeyError, self.frame.ix.__setitem__, (("bar", "three"), "B"), 0)

    # ----------------------------------------------------------------------
    # AMBIGUOUS CASES!

    def test_partial_ix_missing(self):
        # Skipped pending a decision on mixed positional/label semantics;
        # everything after the raise is intentionally unreachable.
        raise nose.SkipTest
        result = self.ymd.ix[2000, 0]
        expected = self.ymd.ix[2000]["A"]
        assert_series_equal(result, expected)
        # need to put in some work here
        # self.ymd.ix[2000, 0] = 0
        # self.assert_((self.ymd.ix[2000]['A'] == 0).all())
        self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6))
        self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6), 0)

    def test_fancy_2d(self):
        # Skipped: ambiguous fancy 2-d indexing; body kept for reference.
        raise nose.SkipTest
        result = self.frame.ix["foo", "B"]
        expected = self.frame.xs("foo")["B"]
        assert_series_equal(result, expected)
        ft = self.frame.T
        result = ft.ix["B", "foo"]
        expected = ft.xs("B")["foo"]
        assert_series_equal(result, expected)

    # ----------------------------------------------------------------------

    def test_to_html(self):
        # Smoke test: to_html must not raise with a named column MultiIndex.
        self.ymd.columns.name = "foo"
        self.ymd.to_html()
        self.ymd.T.to_html()

    def test_level_with_tuples(self):
        # Tuples as level *values* (not as full index keys) must be
        # retrievable without being confused with multi-level keys.
        index = MultiIndex(
            levels=[[("foo", "bar", 0), ("foo", "baz", 0), ("foo", "qux", 0)], [0, 1]],
            labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
        )
        series = Series(np.random.randn(6), index=index)
        frame = DataFrame(np.random.randn(6, 4), index=index)
        result = series[("foo", "bar", 0)]
        result2 = series.ix[("foo", "bar", 0)]
        expected = series[:2]
        expected.index = expected.index.droplevel(0)
        assert_series_equal(result, expected)
        assert_series_equal(result2, expected)
        self.assertRaises(KeyError, series.__getitem__, (("foo", "bar", 0), 2))
        result = frame.ix[("foo", "bar", 0)]
        result2 = frame.xs(("foo", "bar", 0))
        expected = frame[:2]
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)
        index = MultiIndex(
            levels=[[("foo", "bar"), ("foo", "baz"), ("foo", "qux")], [0, 1]],
            labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
        )
        series = Series(np.random.randn(6), index=index)
        frame = DataFrame(np.random.randn(6, 4), index=index)
        result = series[("foo", "bar")]
        result2 = series.ix[("foo", "bar")]
        expected = series[:2]
        expected.index = expected.index.droplevel(0)
        assert_series_equal(result, expected)
        assert_series_equal(result2, expected)
        result = frame.ix[("foo", "bar")]
        result2 = frame.xs(("foo", "bar"))
        expected = frame[:2]
        expected.index = expected.index.droplevel(0)
        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)

    def test_int_series_slicing(self):
        s = self.ymd["A"]
        result = s[5:]
        expected = s.reindex(s.index[5:])
        assert_series_equal(result, expected)
        exp = self.ymd["A"].copy()
        s[5:] = 0
        exp.values[5:] = 0
        self.assert_(np.array_equal(s.values, exp.values))
        result = self.ymd[5:]
        expected = self.ymd.reindex(s.index[5:])
        assert_frame_equal(result, expected)

    def test_mixed_depth_get(self):
        # Columns where some tuples are padded with "" — a top-level key
        # should resolve to the padded column and keep the short name.
        arrays = [
            ["a", "top", "top", "routine1", "routine1", "routine2"],
            ["", "OD", "OD", "result1", "result2", "result1"],
            ["", "wx", "wy", "", "", ""],
        ]
        tuples = zip(*arrays)
        tuples.sort()
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4, 6), columns=index)
        result = df["a"]
        expected = df["a", "", ""]
        assert_series_equal(result, expected)
        self.assertEquals(result.name, "a")
        result = df["routine1", "result1"]
        expected = df["routine1", "result1", ""]
        assert_series_equal(result, expected)
        self.assertEquals(result.name, ("routine1", "result1"))

    def test_mixed_depth_insert(self):
        arrays = [
            ["a", "top", "top", "routine1", "routine1", "routine2"],
            ["", "OD", "OD", "result1", "result2", "result1"],
            ["", "wx", "wy", "", "", ""],
        ]
        tuples = zip(*arrays)
        tuples.sort()
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4, 6), columns=index)
        result = df.copy()
        expected = df.copy()
        # Scalar-key insert is equivalent to inserting the ""-padded tuple.
        result["b"] = [1, 2, 3, 4]
        expected["b", "", ""] = [1, 2, 3, 4]
        assert_frame_equal(result, expected)

    def test_mixed_depth_drop(self):
        arrays = [
            ["a", "top", "top", "routine1", "routine1", "routine2"],
            ["", "OD", "OD", "result1", "result2", "result1"],
            ["", "wx", "wy", "", "", ""],
        ]
        tuples = zip(*arrays)
        tuples.sort()
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4, 6), columns=index)
        # Dropping a short key drops the matching padded column(s).
        result = df.drop("a", axis=1)
        expected = df.drop([("a", "", "")], axis=1)
        assert_frame_equal(expected, result)
        result = df.drop(["top"], axis=1)
        expected = df.drop([("top", "OD", "wx")], axis=1)
        expected = expected.drop([("top", "OD", "wy")], axis=1)
        assert_frame_equal(expected, result)

    def test_mixed_depth_pop(self):
        arrays = [
            ["a", "top", "top", "routine1", "routine1", "routine2"],
            ["", "OD", "OD", "result1", "result2", "result1"],
            ["", "wx", "wy", "", "", ""],
        ]
        tuples = zip(*arrays)
        tuples.sort()
        index = MultiIndex.from_tuples(tuples)
        df = DataFrame(randn(4, 6), columns=index)
        df1 = df.copy()
        df2 = df.copy()
        # pop with a short key equals pop with the ""-padded tuple,
        # and both leave the frame in the same state.
        result = df1.pop("a")
        expected = df2.pop(("a", "", ""))
        assert_series_equal(expected, result)
        assert_frame_equal(df1, df2)
        self.assertEquals(result.name, "a")
        expected = df1["top"]
        df1 = df1.drop(["top"], axis=1)
        result = df2.pop("top")
        assert_frame_equal(expected, result)
        assert_frame_equal(df1, df2)
#casetovars varstocases d = {'one':[1,1],'two':[2,2]} i = ['a','b'] # Create dataframe df = DataFrame(data = d, index = i) df #varstocases df.stack() #casestoVars df.unstack() #aggregate d = {'one':[1,1,1,1,1],'two':[2,2,2,2,2],'letter':['a','a','b','b','c']} # Create dataframe df = DataFrame(d) df one = df.groupby('letter')
def create_fip(year = 2006): # message('03_fip') """ Creates a 'fipDat' table containing all these 'fip individuals' """ df = DataCollection(year=year) print 'Démarrer 03_fip' # # anaisenf: année de naissance des PAC # erfFoyVar <- c('anaisenf','declar') # foyer <- LoadIn(erfFoyFil) # foyer <- LoadIn(erfFoyFil,erfFoyVar) # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992') # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990) erfFoyVar = ['declar', 'anaisenf'] foyer = df.get_values(table="foyer", variables=erfFoyVar) print_id(foyer) # control(foyer, verbose=True, verbose_length=10, debug=True) # #*********************************************************************************************************** # # print "Step 1 : on recupere les personnes à charge des foyers" # #********************************************************************************************************** # # On traite les cas de declarations multiples pour ne pas créer de doublon de pac # # # # On récupère toutes les pac des foyers # L <- max(nchar(foyer$anaisenf))/5 # nombre de pac maximal # fip <-data.frame(declar = foyer$declar) # for (i in c(1:L)){ # eval(parse(text = paste('fip$typ.',as.character(i),'<- substr(foyer$anaisenf,5*(i-1)+1,5*(i-1)+1)',sep = ''))) # eval(parse(text = paste('fip$naia.',as.character(i),'<- as.numeric(substr(foyer$anaisenf,5*(i-1)+2,5*(i-1)+5))',sep = ''))) # } # fip <- fip[!is.na(fip$typ.1),] # fip <- reshape(fip,direction ='long', varying=2:17, sep=".") # fip <- fip[!is.na(fip$naia),] # fip <- fip[order(fip$declar,-rank(fip$typ),fip$naia),c('declar','naia','typ')] # fip$N <- row(fip)[,1] # str(fip$N) print "Etape 1 : on recupere les personnes à charge des foyers" print " 1.1 : Création des codes des enfants" foyer['anaisenf'] = foyer['anaisenf'].astype('string') nb_pac_max = len(max(foyer['anaisenf'], key=len))/5 print "il ya a au maximum %s 
pac par foyer" %nb_pac_max # Separating the string coding the pac of each "déclaration". # Creating a list containing the new variables. # Creating the multi_index for the columns multi_index_columns = [] for i in range(1, nb_pac_max + 1): pac_tuples_list = [(i, 'declaration'), (i, 'type_pac'), (i, 'naia')] multi_index_columns += pac_tuples_list columns = MultiIndex.from_tuples(multi_index_columns, names=['pac_number', 'variable']) fip = DataFrame(randn(len(foyer), 3*nb_pac_max), columns=columns) fip.fillna(NaN, inplace=True) # inutile a cause de la ligne précédente, to remove for i in range(1,nb_pac_max+1): fip[(i, 'declaration')] = foyer['declar'].values fip[(i,'type_pac')] = foyer['anaisenf'].str[5*(i-1)] fip[(i,'naia')] = foyer['anaisenf'].str[5*(i-1)+1:5*(i)] fip = fip.stack("pac_number") fip.reset_index(inplace=True) del fip["level_0"] # print fip.describe() # print fip.head().to_string() print " 1.2 : elimination des foyers fiscaux sans pac" #Clearing missing values and changing data format fip = fip[(fip['type_pac'].notnull()) & (fip['naia'] != 'an') & (fip['naia'] != '')] fip = fip.sort(columns=['declaration','naia','type_pac']) # TODO: check if useful fip.set_index(["declaration","pac_number"], inplace=True) fip = fip.reset_index() del fip['pac_number'] # control(fip, debug=True, verbose=True, verbose_columns=['naia']) print " 1.3 : on enlève les individus F pour lesquels il existe un individu G" tyFG = fip[fip.type_pac.isin(['F', 'G'])] #Filtre pour ne travailler que sur F & G tyFG['same_pair'] = tyFG.duplicated(cols=['declaration', 'naia'], take_last=True) tyFG['is_twin'] = tyFG.duplicated(cols=['declaration', 'naia', 'type_pac']) tyFG['to_keep'] = (~(tyFG['same_pair']) | (tyFG['is_twin'])) #Note : On conserve ceux qui ont des couples déclar/naia différents et les jumeaux #puis on retire les autres (à la fois F et G) print len(tyFG),'/', len(tyFG[tyFG['to_keep']]) print 'longueur fip', len(fip) fip['to_keep'] = NaN fip.update(tyFG) print 'enfants F & G 
traités' print " 1.4 : on enlève les H pour lesquels il y a un I" tyHI = fip[fip.type_pac.isin(['H', 'I'])] tyHI['same_pair'] = tyHI.duplicated(cols=['declaration', 'naia'], take_last=True) tyHI['is_twin'] = tyHI.duplicated(cols=['declaration', 'naia', 'type_pac']) tyHI['to_keep'] = ~(tyHI['same_pair']) | (tyHI['is_twin']) fip.update(tyHI) fip['to_keep'] = fip['to_keep'].fillna(True) print 'nb lines to keep/nb initial lines' print len(fip[fip['to_keep']]), '/', len(fip) indivifip = fip[fip['to_keep']]; del indivifip['to_keep'], fip, tyFG, tyHI # control(indivifip, debug=True) # #************************************************************************************************************/ print '' print 'Step 2 : matching indivifip with eec file' # #************************************************************************************************************/ indivi = load_temp(name="indivim", year=year) #TODO: USE THIS INSTEAD OF PREVIOUS LINES # pac <- indivi[!is.na(indivi$persfip) & indivi$persfip == 'pac',] # pac$key1 <- paste(pac$naia,pac$declar1) # pac$key2 <- paste(pac$naia,pac$declar2) # indivifip$key <- paste(indivifip$naia,indivifip$declar) #TODO: replace Indivi['persfip'] is not NaN by indivi['persfip'].notnull() import pdb pdb.set_trace() pac = indivi[(indivi['persfip'] is not NaN) & (indivi['persfip']=='pac')] pac['naia'] = pac['naia'].astype('int32') # TODO: was float in pac fix upstream indivifip['naia'] = indivifip['naia'].astype('int32') pac['key1'] = zip(pac['naia'], pac['declar1'].str[:29]) pac['key2'] = zip(pac['naia'], pac['declar2'].str[:29]) indivifip['key'] = zip(indivifip['naia'], indivifip['declaration'].str[:29]) assert pac.naia.dtype == indivifip.naia.dtype, 'types %s , %s are different' %(pac.naia.dtype, indivifip.naia.dtype) # fip <- indivifip[!indivifip$key %in% pac$key1,] # fip <- fip[!fip$key %in% pac$key2,] fip = indivifip[~(indivifip.key.isin(pac.key1.values))] fip = fip[~(fip.key.isin(pac.key2.values))] print " 2.1 new fip created" # 
We build a dataframe to link the pac to their type and noindiv # table(duplicated(pac[,c("noindiv")])) countInd = pac.noindiv.value_counts() # pacInd1 <- merge(pac[,c("noindiv","key1","naia")], # indivifip[,c("key","typ")], by.x="key1", by.y="key") # pacInd2 <- merge(pac[,c("noindiv","key2","naia")], # indivifip[,c("key","typ")], by.x="key2", by.y="key") tmp_pac1 = pac[['noindiv', 'key1']] tmp_pac2 = pac[['noindiv', 'key2']] tmp_indivifip = indivifip[['key', 'type_pac', 'naia']] pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner') print 'longueur pacInd1' , len(pac_ind1) pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner') print 'longueur pacInd2', len(pac_ind2) print "pacInd1&2 créés" # table(duplicated(pacInd1)) # table(duplicated(pacInd2)) print pac_ind1.duplicated().sum() print pac_ind2.duplicated().sum() # pacInd1 <-rename(pacInd1,c("key1" = "key")) # pacInd2 <-rename(pacInd2,c("key2" = "key")) # pacInd <- rbind(pacInd1,pacInd2) # rm(pacInd1,pacInd2) # pacInd1.rename(columns={'key1':'key'}, inplace=True) # pacInd2.rename(columns={'key2':'key'}, inplace=True) del pac_ind1['key1'], pac_ind2['key2'] print pac_ind1.columns print pac_ind2.columns if pac_ind1.index == []: if pac_ind2.index == []: print "Warning : no link between pac and noindiv for both pacInd1&2" else: print "Warning : pacInd1 is an empty data frame" pacInd = pac_ind2 elif pac_ind2.index == []: print "Warning : pacInd2 is an empty data frame" pacInd = pac_ind1 else: pacInd = concat([pac_ind2, pac_ind1]) print len(pac_ind1), len(pac_ind2), len(pacInd) print pac_ind2.type_pac.isnull().sum() print pacInd.type_pac.value_counts() print ' 2.2 : pacInd created' # table(duplicated(pacInd[,c("noindiv","typ")])) # table(duplicated(pacInd$noindiv)) print 'doublons noindiv, type_pac', pacInd.duplicated(['noindiv', 'type_pac']).sum() print 'doublons noindiv seulement', pacInd.duplicated('noindiv').sum() print 'nb de NaN', 
pacInd.type_pac.isnull().sum() del pacInd["key"] pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))] # pacIndiv.reset_index(inplace=True) print pacIndiv.columns save_temp(pacIndiv, name="pacIndiv", year=year) print pacIndiv.type_pac.value_counts() gc.collect() # # We keep the fip in the menage of their parents because it is used in to # # build the famille. We should build an individual ident for the fip that are # # older than 18 since they are not in their parents' menage according to the eec # individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous")) # individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")] # individec1 <- upData(individec1,rename=c(declar1="declar")) # fip1 <- merge(fip,individec1) # indivi$noidec <- as.numeric(substr(indivi$declar1,1,2)) indivi['noidec'] = indivi['declar1'].str[0:2].astype('float16') # To be used later to set idfoy individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi['persfip']=="vous")] individec1 = individec1.loc[:, ["declar1","noidec","ident","rga","ztsai","ztsao"]] individec1 = individec1.rename(columns={'declar1':'declaration'}) fip1 = fip.merge(individec1, on='declaration') print ' 2.3 : fip1 created' # # TODO: On ne s'occupe pas des declar2 pour l'instant # # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous")) # # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")] # # individec2 <- upData(individec2,rename=c(declar2="declar")) # # fip2 <-merge(fip,individec2) individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip']=="vous")] individec2 = individec2.loc[:, ["declar2","noidec","ident","rga","ztsai","ztsao"]] individec2.rename(columns={'declar2':'declaration'}, inplace=True) print individec2.head() fip2 = fip.merge(individec2) print ' 2.4 : fip2 created' fip1.duplicated().value_counts() fip2.duplicated().value_counts() # #fip <- rbind(fip1,fip2) # fip <- fip1 # table(fip$typ) fip 
= concat([fip1, fip2]) # fip = fip1 #TODO: Pourquoi cette ligne ? fip.type_pac.value_counts() print fip.columns fip['persfip'] = 'pac' fip['year'] = year fip['year'] = fip['year'].astype('float') # BUG; pas de colonne année dans la DF fip['noi'] = 99 fip['noicon'] = None fip['noindiv'] = fip['declaration'] fip['noiper'] = None fip['noimer'] = None fip['declar1'] = fip['declaration'] #TODO declar ? fip['naim'] = 99 fip['lien'] = None fip['quelfic'] = 'FIP' fip['acteu'] = None fip['agepf'] = fip['year'] - fip['naia'].astype('float') fip['lpr'] = where(fip['agepf'] <=20, 3, 4) # TODO pas très propre d'après Mahdi/Clément fip['stc'] = None fip['contra'] = None fip['titc'] = None fip['mrec'] = None fip['forter'] = None fip['rstg'] = None fip['retrai'] = None fip['cohab'] = None fip['sexe'] = None fip['persfip'] = "pac" fip['agepr'] = None fip['actrec'] = where(fip['agepf']<=15, 9, 5) ## TODO: probleme actrec des enfants fip entre 16 et 20 ans : on ne sait pas s'ils sont étudiants ou salariés */ ## TODO problème avec les mois des enfants FIP : voir si on ne peut pas remonter à ces valeurs: Alexis : clairement non # Reassigning noi for fip children if they are more than one per foyer fiscal # while ( any(duplicated( fip[,c("noi","ident")]) ) ) { # dup <- duplicated( fip[, c("noi","ident")]) # tmp <- fip[dup,"noi"] # fip[dup, "noi"] <- (tmp-1) # } #TODO: Le vecteur dup est-il correct fip["noi"] = fip["noi"].astype("int64") fip["ident"] = fip["ident"].astype("int64") fip_tmp = fip[['noi','ident']] while any(fip.duplicated(cols=['noi', 'ident'])): fip_tmp = fip.loc[:, ['noi', 'ident']] dup = fip_tmp.duplicated() tmp = fip.loc[dup, 'noi'] print len(tmp) fip.loc[dup, 'noi'] = tmp.astype('int64') - 1 fip['idfoy'] = 100*fip['ident'] + fip['noidec'] fip['noindiv'] = 100*fip['ident'] + fip['noi'] fip['type_pac'] = 0 ; fip['key'] = 0 print fip.duplicated('noindiv').value_counts() save_temp(fip, name="fipDat", year=year) del fip, fip1, individec1, indivifip, indivi, pac print 'fip 
sauvegardé'
import numpy as np from pandas import Series, DataFrame # 重塑和轴向索引 # 重新排列表格型数据的基础运算。也称为重塑reshape或轴向旋转pivot # 重塑层次化索引 # stack 将数据的列旋转为行 # unstack 将数据的行旋转为列 data = DataFrame(np.arange(6).reshape((2, 3)), index=pd.Index(['Ohio', 'Colorado'], name='state'), columns=pd.Index(['one', 'two', 'three'], name='number')) print data # 使用stack方法,将列旋转为行,得到一个Series result = data.stack() print result # 对于一个层次化的Series,可以使用unstack来重排为一个DataFrame # 默认情况是最内层 print result.unstack() # 可以通过传入参数分层级别的编号或者名称来对别的级别的unstack操作 print result.unstack(0) print result.unstack('state') # 如果不是所有级别值都能在各分组中找到的话,那么unstack会引入缺失值 s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd']) s2 = Series([4, 5, 6], index=['c', 'd', 'e']) data2 = pd.concat([s1, s2], keys=['one', 'two']) print data2 print data2.unstack()
from bokeh.charts import Bar, output_file, show, vplot from numpy.random import rand from pandas import DataFrame N = 10 data = DataFrame({'A': rand(N), 'B': rand(N), 'C': rand(N)}) # Stack columns A,B,C and convert the multiindices to columns sdata = data.stack().reset_index() sdata.columns = ['labels', 'stack', 'values'] bar = Bar(sdata, values='values', label='labels', stack='stack', legend='top_right') bar2 = Bar(sdata, values='values', label='labels', stack='stack', legend='top_right') bar2.x_range = bar.x_range # Link the x axes output_file("stacked_bar.html") show(vplot(bar, bar2))