def cov_section(pair_cols: pd.DataFrame, mu_star: pd.DataFrame) -> pd.Series:
    """Return the sectional covariogram of the pairs of function
    evaluations that resulted from each star point.

    This function is specific for the time-series varying/aggregate of the
    VARS sensitivity analysis.

    Parameters
    ----------
    pair_cols : array_like
        a Pandas DataFrame of paired function evaluations (columns ``0``
        and ``1``), indexed by the MultiIndex levels ``ts``, ``centre``,
        ``param``, and ``h``
    mu_star : array_like
        a Pandas DataFrame of mu star values that are calculated
        separately, aligned with ``pair_cols`` on the index

    Returns
    -------
    cov_section_values : array_like
        the sectional covariogram dataframe

    References
    ----------
    .. [1] Razavi, S., & Gupta, H. V. (2016). A new framework
           for comprehensive, robust, and efficient global sensitivity
           analysis: 1. Theory. Water Resources Research, 52(1), 423-439.
           doi: 10.1002/2015WR017558
    .. [2] Razavi, S., & Gupta, H. V. (2016). A new framework
           for comprehensive, robust, and efficient global sensitivity
           analysis: 1. Application. Water Resources Research, 52(1),
           423-439. doi: 10.1002/2015WR017559
    """
    # Subtract mu-star once (the original evaluated the same .sub() twice),
    # then average the cross-products within each (ts, centre, param, h)
    # section.
    centred = pair_cols.sub(mu_star, axis=0)
    cov_section_values = (centred[0] * centred[1]).groupby(
        level=['ts', 'centre', 'param', 'h']).mean()

    return cov_section_values
def pd_02():
    """Demonstrate DataFrame/Series arithmetic and broadcasting.

    Fixed for modern Python/pandas: the Python 2 ``print`` statements
    became calls and the removed ``DataFrame.ix`` indexer was replaced
    with ``iloc``.
    """
    frame = DataFrame(np.arange(12.).reshape(4, 3), columns=list('bde'),
                      index=['Ohio', 'Colorado', 'Utah', 'New York'])
    print(frame)
    # Row broadcasting: subtracting a row Series matches on the columns.
    series = frame.iloc[0]
    print(series)
    print(frame - series)
    # Labels present on only one side ('f') produce NaN columns.
    series2 = Series(range(3), index=['b', 'e', 'f'])
    print(frame + series2)
    # Column broadcasting needs the explicit .sub(..., axis=0) form.
    series3 = frame['d']
    print(series3)
    print(frame.sub(series3, axis=0))
def transform(self, time_series: pd.DataFrame) -> pd.DataFrame:
    """Remove the fitted trend from ``time_series``.

    Parameters
    ----------
    time_series: pd.DataFrame, shape (n_samples, 1), required
        The time series to transform.

    Returns
    -------
    time_series_t : pd.DataFrame, shape (n_samples, n_features)
        The transformed time series, without the trend.
    """
    check_is_fitted(self)

    # Evaluate the fitted trend model at each (scaled) time step.
    trend_fn = TRENDS[self.trend]
    steps = (time_series.index - self.t0_) / self.period_
    values = np.array(
        [trend_fn(step, self.best_trend_params_) for step in steps]
    ).flatten()

    trend = pd.Series(values, index=time_series.index)
    return time_series.sub(trend, axis=0)
def crdDiff(dMarker: dict, dfUTMh: pd.DataFrame, plotCrds: list, logger: logging.Logger) -> Tuple[pd.DataFrame, dict]:
    """
    calculates the differences of UTM,ellH using reference position or mean position
    """
    cFuncName = colored(os.path.basename(__file__), 'yellow') + ' - ' + colored(sys._getframe().f_code.co_name, 'green')

    # determine the coordinates of used reference (either mean or user determined)
    # FIX: the original compared the marker values to [np.NaN]*3 with '==',
    # which is always False because NaN never compares equal to NaN, so the
    # weighted-average fallback was unreachable. Use isnan() instead.
    markerCrds = [dMarker['UTM.E'], dMarker['UTM.N'], dMarker['ellH']]
    if np.all(np.isnan(markerCrds)):
        # no reference position given, use mean position
        originCrds = [float(amc.dRTK['WAvg'][crd]) for crd in plotCrds]
    else:
        # make difference to reference position
        originCrds = [float(amc.dRTK['marker'][crd]) for crd in plotCrds]

    # subtract origin coordinates from UTMh positions
    dfCrd = dfUTMh.sub(originCrds, axis='columns')
    amutils.logHeadTailDataFrame(logger=logger, callerName=cFuncName, df=dfCrd, dfName='dfCrd')

    # round the extremes outward to whole units for use as plot limits
    crdMax = max(dfCrd.max())
    crdMin = min(dfCrd.min())
    crdMax = int(crdMax + (1 if crdMax > 0 else -1))
    crdMin = int(crdMin + (1 if crdMin > 0 else -1))

    dCrdLim = {'max': crdMax, 'min': crdMin}

    return dfCrd, dCrdLim
def diff(df: pd.DataFrame, subtrahend: str, drop: bool = False) -> Union[pd.DataFrame, list]:
    """Subtract the column named by ``subtrahend`` from the data.

    Args:
        df : DataFrame or list of DataFrames.
        subtrahend : Name of the column to subtract. Set to ``'mean'`` to
            subtract the row-wise mean instead of a column.
        drop: Whether to drop the ``subtrahend`` column from the result.

    Returns:
        Union[pd.DataFrame, list]: the difference, mirroring the input shape.
    """
    # A list input is handled element-wise with the exact same single-frame
    # logic (the original duplicated the whole body for the list branch).
    if isinstance(df, list):
        return [diff(_df, subtrahend, drop) for _df in df]

    if subtrahend == 'mean':
        diff_value = df.mean(axis=1)
    else:
        diff_value = df[subtrahend]

    result = df.sub(diff_value, axis=0)
    if drop:
        # NOTE: as in the original, this raises KeyError when
        # subtrahend == 'mean' since there is no such column to drop.
        result = result.drop(subtrahend, axis=1)
    return result
def test_column_dups_indexing(self):
    # Arithmetic between frames with duplicated index labels should align
    # pairwise instead of raising (GH 5185).
    left = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
    right = DataFrame([1, 2, 3], index=[1, 2, 3])
    outcome = left.sub(right)
    tm.assert_frame_equal(
        outcome,
        DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3]),
    )
def mean_centered(self, utility_matrix: pd.DataFrame) -> pd.DataFrame:
    """Mean-centre each column of the utility matrix, then hand the
    result to the cosine-similarity computation.

    :param utility_matrix: the ratings matrix to centre
    :return: cosine similarity matrix of the mean-centred ratings
    """
    centred = utility_matrix - utility_matrix.mean()
    return self.data.similarity_matrix_cosine(centred)
def distances_euclid(data: pd.DataFrame, target_index: int) -> pd.Series:
    """Euclidean distance from every row of ``data`` to the row at
    ``target_index``: sqrt(sum((actual_rating - target_rating)^2))."""
    deltas = data - data.loc[target_index]
    return np.sqrt((deltas ** 2).sum(axis='columns'))
def _calc_assets_returns(self, dataframe: pd.DataFrame) -> pd.DataFrame: """ Calculate dataframe of assets returns Parameters: dataframe: current dataframe Returns: Dataframe with assets returns """ shifted_df = dataframe.shift(1) return dataframe.sub(shifted_df).div(shifted_df)
def transform(self, time_series: pd.DataFrame) -> pd.DataFrame:
    """Transform ``time_series`` by removing the fitted exponential trend.

    Parameters
    ----------
    time_series : ``pd.DataFrame``, required.
        The time series to transform.

    Returns
    -------
    transformed_time_series : ``pd.DataFrame``
        The transformed time series, without the trend.
    """
    # Scale the index into model time steps, evaluate exp(t * exponent)
    # at each step, and subtract that trend row-wise.
    steps = (time_series.index - self.t0_) / self.period_
    trend = pd.Series(
        index=time_series.index,
        data=[np.exp(step * self.model_exponent_) for step in steps],
    )
    return time_series.sub(trend, axis=0)
def transform(self, ts: pd.DataFrame) -> pd.DataFrame:
    """Remove the fitted polynomial trend from ``ts``.

    Parameters
    ----------
    ts: pd.DataFrame, shape (n_samples, 1), required
        The time series to transform.

    Returns
    -------
    ts_t : pd.DataFrame, shape (n_samples, n_features)
        The transformed time series, without the trend.
    """
    check_is_fitted(self)

    # Build the fitted polynomial and evaluate it at each scaled time step.
    poly = np.poly1d(self.model_weights_)
    steps = (ts.index - self.t0_) / self.period_
    trend = pd.Series(index=ts.index, data=[poly(step) for step in steps])

    return ts.sub(trend, axis=0)
def val_convert_to_zscore(df: pd.DataFrame,
                          mean_base_date: Optional[date],
                          calc_period: Tuple[Optional[date], Optional[date]],
                          output_std_and_mean: bool = False) -> pd.DataFrame:
    """Compute z-scores over the ``calc_period`` window, using the row at
    ``mean_base_date`` as the mean.

    ``mean_base_date`` is assumed to lie inside ``calc_period``.
    NOTE: when ``mean_base_date`` is None, the column-wise mean is computed
    instead of taking a single period's row as the mean.

    NOTE(review): despite the annotation, this returns a 3-tuple
    (zscore, mean, std) when ``output_std_and_mean`` is True — confirm
    against callers before tightening the annotation.
    """
    start, end = calc_period
    df = _filter_df_by_start_end(df, start, end)
    # Subtract either the per-column mean or the fixed base-date row
    # (matched at midnight of mean_base_date).
    if mean_base_date is None:
        series_mean = df.agg("mean", axis=0)
    else:
        series_mean = df[df.index == datetime.combine(mean_base_date, datetime.min.time())].iloc[0]
    df_delta = df.sub(series_mean, axis=1)
    # Column-wise standard deviation over the same window.
    df_std = df.agg("std", axis=0)
    df_zscore = df_delta.div(df_std, axis=1)
    if output_std_and_mean:
        return df_zscore, series_mean, df_std
    else:
        return df_zscore
def test_column_dups_indexing(self):
    # Regression tests for indexing and aligning DataFrames that carry
    # duplicate column or index labels.

    def check(result, expected=None):
        # Optionally compare to expected, then make sure dtypes and repr
        # don't blow up on duplicate labels.
        if expected is not None:
            tm.assert_frame_equal(result, expected)
        result.dtypes
        str(result)

    # boolean indexing
    # GH 4879
    dups = ["A", "A", "C", "D"]
    df = DataFrame(np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64")
    expected = df[df.C > 6]
    expected.columns = dups
    df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
    result = df[df.C > 6]
    check(result, expected)

    # where
    df = DataFrame(np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64")
    expected = df[df > 6]
    expected.columns = dups
    df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
    result = df[df > 6]
    check(result, expected)

    # boolean with the duplicate raises
    df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
    msg = "cannot reindex from a duplicate axis"
    with pytest.raises(ValueError, match=msg):
        df[df.A > 6]

    # dup aligning operations should work
    # GH 5185
    df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
    df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
    expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
    result = df1.sub(df2)
    tm.assert_frame_equal(result, expected)

    # equality
    df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]], columns=["A", "B"])
    df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]], columns=["A", "A"])

    # not-comparing like-labelled
    msg = "Can only compare identically-labeled DataFrame objects"
    with pytest.raises(ValueError, match=msg):
        df1 == df2

    df1r = df1.reindex_like(df2)
    result = df1r == df2
    expected = DataFrame(
        [[False, True], [True, False], [False, False], [True, False]],
        columns=["A", "A"],
    )
    tm.assert_frame_equal(result, expected)

    # mixed column selection
    # GH 5639
    dfbool = DataFrame({
        "one": Series([True, True, False], index=["a", "b", "c"]),
        "two": Series([False, False, True, False], index=["a", "b", "c", "d"]),
        "three": Series([False, True, True, True], index=["a", "b", "c", "d"]),
    })
    expected = pd.concat([dfbool["one"], dfbool["three"], dfbool["one"]], axis=1)
    result = dfbool[["one", "three", "one"]]
    check(result, expected)

    # multi-axis dups
    # GH 6121
    df = DataFrame(
        np.arange(25.0).reshape(5, 5),
        index=["a", "b", "c", "d", "e"],
        columns=["A", "B", "C", "D", "E"],
    )
    z = df[["A", "C", "A"]].copy()
    expected = z.loc[["a", "c", "a"]]

    df = DataFrame(
        np.arange(25.0).reshape(5, 5),
        index=["a", "b", "c", "d", "e"],
        columns=["A", "B", "C", "D", "E"],
    )
    z = df[["A", "C", "A"]]
    result = z.loc[["a", "c", "a"]]
    check(result, expected)
# --- Tutorial fragment: DataFrame/Series arithmetic (legacy: frame.ix) ---
# NOTE(review): df1/df2 are defined earlier, outside this fragment.
print(df1)
print(df2)
print(df1.add(df2, fill_value=0))
print(df1.reindex(columns=df2.columns, fill_value=0))
print()
print("## Operate between Series and DataFrame:")
arr = np.arange(12.).reshape((3, 4))
print(arr)
print(arr[0])
print(arr - arr[0])
frame = DataFrame(np.arange(12).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.ix[0]
print("frame:")
print(frame)
print("series:")
print(series)
print(type(frame))
print(type(series))
print(frame - series)
series2 = Series(range(3), index=list('bef'))
print("series2:")
print(series2)
print(frame + series2)
series3 = frame['d']
print("series3:")
print(series3)
print(frame.sub(series3, axis=0))  # subtract by column (along dim 0)

# --- Notebook fragment: Sharpe ratios of trading strategies ---
# NOTE(review): this fragment begins inside a dict literal whose opening
# (and the names Large_high/Small_high/dates/returns/d3/plt) are outside
# this view.
    'large_high': Large_high.values,
    'small_high': Small_high.values
}, index=dates[1:])
cum_returns.head()

"""## Cumulative Return Plot of All Strategies"""

plt.figure(figsize=(20, 5))
plt.plot(cum_returns)
plt.legend(list(cum_returns.columns), loc='upper left')
plt.show()

"""## 5. Sharpe Ratio"""

risk_free = Series(d3.RF[1:].values, index=dates[1:]) / 100
summary = pd.DataFrame()
# Annualised excess return: compound monthly excess returns per year.
summary['excess_ret_total'] = returns.sub(
    risk_free, axis=0).add(1).resample('A').agg('prod').sub(1).mean()
summary['vol_total'] = returns.apply(np.std) * 12**0.5
summary['Sharpe_total'] = summary.excess_ret_total / summary.vol_total
summary

##sharpe ratio before 2009
summary_1 = pd.DataFrame()
summary_1['excess_ret_before'] = returns.sub(
    risk_free, axis=0).add(1).resample('A').agg('prod').sub(1)[:'2009-01-01'].mean()
summary_1['vol_before'] = returns[:'2009-01-01'].apply(np.std) * 12**0.5
summary_1['Sharpe_before'] = summary_1.excess_ret_before / summary_1.vol_before
summary_1

##sharpe ratio after 2009
summary_2 = pd.DataFrame()
# NOTE(review): this expression is cut off mid-call in the source.
summary_2['excess_ret_after'] = returns.sub(
    risk_free,

# --- REPL fragment: broadcasting and apply ---
frame
series
frame - series
series2 = Series(range(3), index=['b', 'e', 'f'])
frame + series2
series3 = frame['d']
frame
series3
frame.sub(series3, axis=0)

# function application and mapping
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
np.abs(frame)
f = lambda x: x.max() - x.min()
frame.apply(f)
frame.apply(f, axis=1)
### NB: a bunch are built in: eg: sum and mean

# --- Python 2 fragment: column-wise subtraction with expected output ---
print frame
#          b   d   e
# Utah     0   1   2
# Ohio     3   4   5
# Texas    6   7   8
# Oregon   9  10  11
series3 = frame['d']
print series3
# Utah       1
# Ohio       4
# Texas      7
# Oregon    10
# Name: d
print frame.sub(series3, axis=0)
#         b  d  e
# Utah   -1  0  1
# Ohio   -1  0  1
# Texas  -1  0  1
# Oregon -1  0  1
print '######################################################'
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print frame
#                b         d         e
# Utah   -0.776852  0.976385  0.153123

# --- Fragment: alignment and broadcasting (legacy: frame.ix) ---
df1 = DataFrame(np.arange(9).reshape((3,3)), columns=list('bcd'), index=['Ohio','Texas','Colorado'])
df2 = DataFrame(np.arange(12).reshape((4,3)),columns=list('bde'), index=['Utah','Ohio','Texas','Oregon'])
df1 + df2
# "alignment" means values with matching labels are combined
# a fill value can be supplied for labels missing on one side
df1.add(df2, fill_value=0)  # only fills entries absent from df2

## Arithmetic between DataFrame and Series
frame = DataFrame(np.arange(12).reshape((4,3)), columns=list('bde'), index=['Utah','Utahs','Texas','Oregon'])
series = frame.ix[0]
frame - series
# each row subtracts the matching elements ("broadcast down the rows");
# to broadcast along the columns instead, do the following:
series3 = frame['d']
# NOTE(review): 'd' is a column, not an index label — this .ix row lookup
# would fail; probably a slip in the original notes.
series3 = frame.ix['d',:]
frame.sub(series3, axis=0)

## Function application and mapping
frame = DataFrame(np.random.randn(4,3), columns=list('bde'), index=['Utah','Ohio','Texas','Oregon'])
f = lambda x :x.max()-x.min()
# apply f to each column
frame.apply(f)
# apply f to each row
frame.apply(f, axis=1)
# a function returning multiple values
def f(x):
    return Series([x.min(), x.max()], index=['min','max'])
frame.apply(f)
# element-level python function
format = lambda x: '%.2f' % x

# --- Fragment: column broadcasting with expected output ---
# NOTE(review): this fragment starts inside a ''' literal opened earlier.
Utah 0 NaN 3 NaN
Ohio 3 NaN 6 NaN
Texas 6 NaN 9 NaN
Oregon 9 NaN 12 NaN

[4 rows x 4 columns]
'''

'''
If you want to instead broadcast over the columns, matching on the rows,
you have to use one of the arithmetic methods. For example:
'''
series3 = frame['d']
print(frame.sub(series3, axis = 0))
'''
        b  d  e
Utah   -1  0  1
Ohio   -1  0  1
Texas  -1  0  1
Oregon -1  0  1

[4 rows x 3 columns]
'''
#####################################################
# Function application and mapping
############################################################
def test_column_dups_indexing(self):
    # Alignment/selection behaviour with duplicate labels.
    # NOTE(review): this version calls a helper `check(...)` that is not
    # defined in this snippet — presumably defined elsewhere in the class.

    # dup aligning operations should work
    # GH 5185
    df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
    df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
    expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
    result = df1.sub(df2)
    tm.assert_frame_equal(result, expected)

    # equality
    df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]], columns=["A", "B"])
    df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]], columns=["A", "A"])

    # not-comparing like-labelled
    msg = "Can only compare identically-labeled DataFrame objects"
    with pytest.raises(ValueError, match=msg):
        df1 == df2

    df1r = df1.reindex_like(df2)
    result = df1r == df2
    expected = DataFrame(
        [[False, True], [True, False], [False, False], [True, False]],
        columns=["A", "A"],
    )
    tm.assert_frame_equal(result, expected)

    # mixed column selection
    # GH 5639
    dfbool = DataFrame({
        "one": Series([True, True, False], index=["a", "b", "c"]),
        "two": Series([False, False, True, False], index=["a", "b", "c", "d"]),
        "three": Series([False, True, True, True], index=["a", "b", "c", "d"]),
    })
    expected = pd.concat([dfbool["one"], dfbool["three"], dfbool["one"]], axis=1)
    result = dfbool[["one", "three", "one"]]
    check(result, expected)

    # multi-axis dups
    # GH 6121
    df = DataFrame(
        np.arange(25.0).reshape(5, 5),
        index=["a", "b", "c", "d", "e"],
        columns=["A", "B", "C", "D", "E"],
    )
    z = df[["A", "C", "A"]].copy()
    expected = z.loc[["a", "c", "a"]]

    df = DataFrame(
        np.arange(25.0).reshape(5, 5),
        index=["a", "b", "c", "d", "e"],
        columns=["A", "B", "C", "D", "E"],
    )
    z = df[["A", "C", "A"]]
    result = z.loc[["a", "c", "a"]]
    check(result, expected)
# --- REPL fragment: modern pandas (iloc, axis='index'/'columns') ---
# NOTE(review): `arr` is defined before this fragment.
arr[0]
arr - arr[0]
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
series = frame.iloc[0]
series
frame-series
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
frame+series2
frame
series3 = frame['d']
series3
frame.sub(series3, axis='index')
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
np.abs(frame)
f = lambda x: x.max()-x.min()
frame.apply(f)
frame.apply(f, axis='columns')
frame.apply(f, axis=1)
# NOTE(review): bare `Series` here vs `pd.Series` above — assumes a
# `from pandas import Series` elsewhere in the file.
def f(x):
    return Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)
format = lambda x: '%.2f' % x
frame.applymap(format)  #applymap is element wise
frame

# --- Notebook fragment: broadcasting over frame3 ---
# NOTE(review): frame3 is defined in an earlier cell.
series2 = Series(range(3), index=['b', 'e', 'f'])
frame3 + series2

# In[100]:
series3 = frame3['d']
frame3

# In[102]:
series3

# In[106]:
frame4 = frame3.sub(series3, axis=0)
frame4

# In[107]:
np.abs(frame4)

# In[112]:
print(frame3, '\n----------')
f = lambda x: x.max() - x.min()
frame3.apply(f)

# In[110]:
def main():
    """Walkthrough of core pandas operations (Python 2 era).

    NOTE(review): uses APIs removed from modern pandas — ``.ix``,
    ``Series.order``, ``icol``/``irow``, ``sort_index(by=...)`` — and
    Python 2 ``print`` statements; kept as-is.
    """
    # reindex
    obj = Series(range(4), index="a b c d".split(" ")[::-1])
    print obj
    obj2 = obj.reindex("a b c d e".split(" "))
    print obj2
    # Change NaN
    print obj.reindex("a b c d e".split(" "), fill_value=0)
    colors = ["blue", "purple", "yellow"]
    index = [0, 2, 4]
    obj3 = Series(colors, index=index)
    print obj3.reindex(range(6))
    print obj3.reindex(range(6), method="ffill")  # not found forward fill
    print obj3.reindex(range(6), method="backfill")  # bfill

    # DataFrame
    states = ["Ohio", "Texas", "California"]
    frame = DataFrame(np.arange(9).reshape((3, 3)), index="a b c".split(" "), columns=["Ohio", "Texas", "California"])
    print frame
    frame2 = frame.reindex("a b c d".split(" "))
    print frame2
    states[0] = "Utah"
    states[1], states[0] = states[:2]
    print frame.reindex(columns=states)
    # fill
    print frame.reindex("a b c d".split(" "), method="ffill", columns=states)
    print frame.ix["a b c d".split(" ")]
    print frame.ix["a b c d".split(" "), states]

    # Delete column
    print "", ""
    obj = Series(range(5), index="a b c d e".split(" "))
    new_obj = obj.drop("c")
    print new_obj
    print obj

    # Index reference
    print "", ""
    obj = Series(np.arange(4.0), index="a b c d".split(" "))
    print obj["b"]
    print obj[1]  # same
    print obj[2:4]
    print obj[["b", "a", "c"]]
    print obj[[1, 3]]
    print obj[obj < 2]
    # Slice with label
    print obj["b":"c"]  # include 'c'
    obj["b":"c"] = 5
    print obj

    data = DataFrame(
        np.arange(16).reshape((4, 4)),
        index=["Ohio", "Colorado", "Utah", "New York"],
        columns=["one", "two", "three", "four"],
    )
    print data
    # column
    print data["two"]
    print data[["three", "one"]]
    # row
    print data[:2]
    print data[data["three"] > 5]
    # all values
    print data < 5
    data[data < 5] = 0
    print data
    # row and column
    print data.ix[["Colorado"], ["two", "three"]]
    print data.ix[["Colorado", "Utah"], [3, 0, 1]]
    # row
    print data.ix[2]
    # label row and column, return column
    print data.ix[:"Utah", "two"]

    # xs
    # row
    print data.xs("Utah")
    print data.xs("Utah", axis=0)
    # rows
    print data.xs("two", axis=1)
    # icol/irow i is index
    print data.icol(1)
    print data.irow(1)

    # Union
    print "", ""
    s1 = Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
    s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
    print s1
    print s2
    # index is union, but d, f, g are NaN
    print s1 + s2
    df1 = DataFrame(np.arange(9.0).reshape((3, 3)), columns=list("bcd"), index=["Ohio", "Texas", "Colorado"])
    df2 = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    print df1
    print df2
    print df1 + df2

    # arithmetic method
    print "", ""
    df1 = DataFrame(np.arange(12.0).reshape((3, 4)), columns=list("abcd"))
    df2 = DataFrame(np.arange(20.0).reshape((4, 5)), columns=list("abcde"))
    print df1
    print df2
    print df1.add(df2, fill_value=0)
    # reindex has fill_value argument
    # other arithmetic method are sub/div/mul(ti)

    # Calculation in a DataFrame and Series
    print "", ""
    # subtract from each row. broadcat
    arr = np.arange(12.0).reshape((3, 4))
    print arr
    print arr[0]
    print arr - arr[0]
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    series = frame.ix[0]
    print frame
    print series
    print frame - series
    series2 = Series(range(3), index=list("bef"))
    print frame + series2
    series3 = frame["d"]
    series4 = frame.ix[0]
    print frame
    print series3
    print series4
    print frame.sub(series3, axis=0)
    print frame.sub(series4, axis=1)

    # apply function and mapping
    print "", ""
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    print frame
    f = lambda x: x.max() - x.min()
    print frame.apply(f)
    print frame.apply(f, axis=1)
    f = lambda x: Series([x.min(), x.max()], index=["min", "max"])
    print frame.apply(f)
    format = lambda x: "{0:.2f}".format(x)
    print frame.applymap(format)  # frame
    print frame["e"].map(format)  # series

    # sort and rank
    print "", ""
    obj = Series(range(4), index=list("dabc"))
    print obj
    print obj.sort_index()
    frame = DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"], columns=list("dabc"))
    print frame
    print frame.sort_index()
    print frame.sort_index(axis=1)
    print frame.sort_index(axis=1, ascending=False)

    # Sorting series
    print "", ""
    obj = Series([4, 7, -3, 2])
    print obj.order()
    obj = Series([4, np.nan, 7, np.nan, -3, 2])
    print obj.order()
    print obj.order(ascending=False)

    # order by multi columns
    print "", ""
    frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
    print frame.sort_index(by=["a", "b"])

    # rank
    print "", ""
    obj = Series([7, -5, 7, 4, 2, 0, 4])
    print obj.rank()  # method is average
    print obj.rank(method="first")  # No Duplicates
    print obj.rank(ascending=False, method="min")
    print obj.rank(ascending=False, method="max")
    f1 = DataFrame(obj, columns=["data"])
    f2 = DataFrame(obj.rank(), columns=["rank"])
    # merge by each index
    print pd.merge(f1, f2, left_index=True, right_index=True)

    # Index of the axis with duplicate values
    print "", ""
    obj = Series(range(5), index=list("aaabc"))
    print obj
    print obj.index.is_unique
    print obj["a"]
    print obj["c"]
    df = DataFrame(np.arange(12.0).reshape((4, 3)), index=list("aabb"), columns=list("ccd"))
    print df
    print df.ix["b"]
    print df["c"]
# --- Python 2 fragment: subtracting a column with expected output ---
'''
   A  B   C   D
a  0  2   4 NaN
b  3  5   7 NaN
c  6  8  10 NaN
'''
series3 = frame.A
print series3
'''
a    0
b    3
c    6
'''
print frame.sub(series3, axis=0)  # operate column-wise (broadcast down the rows)
'''
   A  B  C
a  0  1  2
b  0  1  2
c  0  1  2
'''
print 'numpy函数在Series/DataFrame的应用'
frame = DataFrame(numpy.arange(9).reshape(3, 3), columns=['A', 'B', 'C'], index=['a', 'b', 'c'])
print frame
'''

# --- Notebook fragment: subtraction and apply ---
frame+se2

# In[151]:
se3=frame['b']

# In[152]:
se3

# In[153]:
frame.sub(se3,axis=0)

# ## 7. Function application and mapping

# In[154]:
frame=DataFrame(np.random.randn(4,3),columns=list('bde'),index=('Utho','Ohio','Ehir','Hude'))

# In[155]:
frame

# In[156]:

# --- Fragment: alignment, broadcasting, apply (legacy: frame.ix) ---
frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill', columns=states)

# Arithmetic and data alignment
df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))
df1+df2
df1.add(df2, fill_value=0)

# Operations between DataFrame and Series
frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.ix[0]
frame - series #boardcast on each row
series2 = frame['d']
frame.sub(series2, axis=0) #boardcast on each column

#Function application and mapping------------------------
# numpy ufuncs are applied element-wise
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'),index=['Utah', 'Ohio', 'Texas', 'Oregon'])
np.abs(frame)
frame.abs()
# DataFrame.apply applies the function to each column by default
f = lambda x: x.max() - x.min() #x is an array?
frame.apply(f)
frame.apply(f,axis=1) # applied to each row
# a function may return multiple values
def f(x):
    return Series([x.min(), x.max()], index = ['min', 'max'])
frame.apply(f)
# element-level python functions go through applymap (map for Series)

# --- Fragment: element-wise frame arithmetic ---
import numpy as np

# create DataFrames
frame1 = DataFrame(np.arange(0,9).reshape(3,3), columns=list('abc'))
frame2 = DataFrame(np.arange(1,10).reshape(3,3), columns=list('abc'))
print(frame1)
print(frame2)

# frame addition
add = frame1.add(frame2)
print(add)

# frame subtraction
sub = frame2.sub(frame1)
print(sub)

# frame division
div = frame2 / frame1
div = frame2.div(frame1)
print(div)  # inf : when the denominator is 0

# frame multiplication
mul = frame1.mul(frame2)
print(mul)

# row/column sums, means, max, min
sum1 = mul.sum(axis = 1)  # row-wise
sum2 = mul.sum(axis = 0)  # column-wise
print('행 단위 합계:\n',sum1)

# --- Fragment: broadcasting and apply (legacy: frame.ix) ---
series = frame.ix[0]
print(frame)
print(series)
print(frame - series)
series2 = Series(range(3), index=['b', 'e', 'f'])
print(frame + series2)  # only the intersection of labels gets values

series3 = frame['d']
print(frame)
print(series3)
print(frame.sub(series3, axis=0))

#Function application and mapping
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)
print(np.abs(frame))
f = lambda x: x.max() - x.min()
print(frame.apply(f))
print(frame.apply(f, axis=1))

# --- Python 2 fragment: fill values and column-wise subtraction ---
print df1
print df2
print df1 + df2
print
print '数据填充'
df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns = list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns = list('abcde'))
print df1
print df2
print df1.add(df2, fill_value = 0)
print df1.reindex(columns = df2.columns, fill_value = 0)
print
print 'DataFrame与Series之间的操作'
arr = np.arange(12.).reshape((3, 4))
print arr
print arr[0]
print arr - arr[0]
frame = DataFrame(np.arange(12).reshape((4, 3)), columns = list('bde'), index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.ix[0]
print frame
print series
print frame - series
series2 = Series(range(3), index = list('bef'))
print frame + series2
series3 = frame['d']
print frame.sub(series3, axis = 0)  # subtract column-wise
from pandas import DataFrame, Series
import pandas as pd

############################## basic functions #################################
# basic functions: get partly info
# NOTE(review): `data` is defined elsewhere; .ix is removed in modern pandas.
data[['Director', 'id', 'Gerne', 'Runtime']]
data.ix[['Crazy Asian', 'Movie 2'], ['Director', 'Runtime']]

# basic functions: math operations
df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape(4, 5), columns=list('abcde'))
df1 - df2  # auto matching, fill NaN
df1.add(df2, fill_value=0)
df1.sub(df2, fill_value=0)
df1.mul(df2, fill_value=0)
df1.div(df2, fill_value=0)
df1.reindex(columns=df2.columns, fill_value=0)

# function map
df1 = DataFrame(np.random.randn(4, 3), columns=list('abc'), index=['id1', 'id2', 'id3', 'id4'])
np.abs(df1)
func1 = lambda x: x.max() - x.min()
# FIX: the original applied the not-yet-defined name `f` here; the lambda
# just defined is `func1`.
df1.apply(func1, axis=1)

# FIX: the original `return ([x.max(), x.min()], index=['max', 'min'])`
# was a SyntaxError — return a Series as clearly intended.
def f(x):
    return Series([x.max(), x.min()], index=['max', 'min'])

df1.apply(f)
format11 = lambda x: '%.2f' % x
# --- Fragment: Series/DataFrame arithmetic with fill values ---
series1 = Series(np.arange(4))
series2 = Series(np.arange(1, 5))
print(series1 + series2)
print(series1 - series2)

# DataFrame + DataFrame
df1 = DataFrame(np.arange(16).reshape((4, 4)), columns=list('abcd'), index=np.arange(4))
df2 = DataFrame(np.arange(1, 17).reshape((4, 4)), columns=list('bcde'), index=np.arange(1, 5))
print(df1 + df2)
print(df1.add(df2, fill_value=0))
print(df1 - df2)
print(df2.sub(df1, fill_value=0))

# DataFrame + Series
df = DataFrame(np.arange(12).reshape((4, 3)), columns=list('abc'), index=np.arange(4))
series = Series(np.arange(3), index=df.columns)
print(df + series)
series = series.reindex(list('bcd'))
print(df + series)
series = df['a']
print(df.sub(series, axis=0))

# function applying
# NOTE(review): this dict literal is cut off mid-expression in the source.
df = DataFrame({'Henry': {'math': 50, 'english': 80, 'phylosophy': 70, 'biology': 20},
                'Kate': {'math': 70, 'biology': 40, 'english': 90, 'phylosophy': 8},

# --- Fragment: broadcasting and apply ---
frame6_1 + frame6_2
frame6_1.add(frame6_2, fill_value=0)  # pass frame6_2 plus a fill_value argument
frame6_1.reindex(columns=frame6_2.columns, fill_value=0)  # a fill value can also be given when reindexing

# Arithmetic between DataFrame and Series
yc10 = np.arange(12.).reshape((3, 4))
yc10
yc10 - yc10[1]
frame7 = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('abc'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series_7 = frame7.iloc[0]
frame7 - series_7  # subtracted from every row: "broadcasting"
series_7_1 = frame7['b']
frame7.sub(series_7_1, axis=0)

## Function application and mapping
frame8 = DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Ohio', 'Utah', 'Texas', 'Oregon'])
frame8
frame8.abs()  # = np.abs(frame8), absolute values
f = lambda x: x.max() - x.min()
frame8.apply(f)
frame8.apply(f, axis=1)
def f(x):
    return Series([x.min(), x.max()], index=['min', 'max'])

# --- Python 2 fragment: fill values and column-wise subtraction ---
print df1
print
print df2
print
print df1.add(df2, fill_value = 0)
print df1.reindex(columns = df2.columns, fill_value = 0)
print
print 'DataFrame与Series之间的操作'
arr = np.arange(12.).reshape((3, 4))
print arr
print arr[0]
print arr - arr[0]
frame = DataFrame(np.arange(12).reshape((4, 3)), columns = list('bde'), index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.ix[0]
print frame
print series
print
print frame - series
print
print frame
print
series2 = Series(range(3), index = list('bef'))
print frame + series2
series3 = frame['d']
print series3
print
print frame.sub(series3, axis = 0)  # subtract column-wise

# --- Python 2 script: alignment and row/column broadcasting ---
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
print s1 + s2
df1 = DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'), index=['Ohio', 'Texas', 'Colorado'])
print df1
df2 = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print df2
print df1 + df2
print df1.add(df2, fill_value=0)
arr = np.arange(12.).reshape((3, 4))
print arr - arr[0]
frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.ix[0]
series2 = Series(range(3), index=['b', 'e', 'f'])
print frame + series2
series3 = frame['d']
print frame.sub(series3, axis=0)
series4 = frame.ix['Utah']
print frame.sub(series4, axis=1)
def get_diff_dataframe(df: pd.DataFrame, df_pre: pd.DataFrame):
    """Element-wise difference ``df - df_pre``; labels present on only
    one side are treated as 0 rather than producing NaN."""
    return df.sub(df_pre, fill_value=0)
def test_column_dups_indexing(self):
    # Legacy (unittest-style) version of the duplicate-label regression
    # tests: self.assertRaises instead of pytest.raises.

    def check(result, expected=None):
        # Optionally compare, then make sure dtypes/repr work with
        # duplicate labels.
        if expected is not None:
            assert_frame_equal(result, expected)
        result.dtypes
        str(result)

    # boolean indexing
    # GH 4879
    dups = ['A', 'A', 'C', 'D']
    df = DataFrame(np.arange(12).reshape(3, 4), columns=[
                   'A', 'B', 'C', 'D'], dtype='float64')
    expected = df[df.C > 6]
    expected.columns = dups
    df = DataFrame(np.arange(12).reshape(3, 4),
                   columns=dups, dtype='float64')
    result = df[df.C > 6]
    check(result, expected)

    # where
    df = DataFrame(np.arange(12).reshape(3, 4), columns=[
                   'A', 'B', 'C', 'D'], dtype='float64')
    expected = df[df > 6]
    expected.columns = dups
    df = DataFrame(np.arange(12).reshape(3, 4),
                   columns=dups, dtype='float64')
    result = df[df > 6]
    check(result, expected)

    # boolean with the duplicate raises
    df = DataFrame(np.arange(12).reshape(3, 4),
                   columns=dups, dtype='float64')
    self.assertRaises(ValueError, lambda: df[df.A > 6])

    # dup aligining operations should work
    # GH 5185
    df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
    df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
    expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
    result = df1.sub(df2)
    assert_frame_equal(result, expected)

    # equality
    df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]],
                    columns=['A', 'B'])
    df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]],
                    columns=['A', 'A'])

    # not-comparing like-labelled
    self.assertRaises(ValueError, lambda: df1 == df2)

    df1r = df1.reindex_like(df2)
    result = df1r == df2
    expected = DataFrame([[False, True], [True, False], [False, False], [
                         True, False]], columns=['A', 'A'])
    assert_frame_equal(result, expected)

    # mixed column selection
    # GH 5639
    dfbool = DataFrame({'one': Series([True, True, False],
                                      index=['a', 'b', 'c']),
                        'two': Series([False, False, True, False],
                                      index=['a', 'b', 'c', 'd']),
                        'three': Series([False, True, True, True],
                                        index=['a', 'b', 'c', 'd'])})
    expected = pd.concat(
        [dfbool['one'], dfbool['three'], dfbool['one']], axis=1)
    result = dfbool[['one', 'three', 'one']]
    check(result, expected)

    # multi-axis dups
    # GH 6121
    df = DataFrame(np.arange(25.).reshape(5, 5),
                   index=['a', 'b', 'c', 'd', 'e'],
                   columns=['A', 'B', 'C', 'D', 'E'])
    z = df[['A', 'C', 'A']].copy()
    expected = z.loc[['a', 'c', 'a']]

    df = DataFrame(np.arange(25.).reshape(5, 5),
                   index=['a', 'b', 'c', 'd', 'e'],
                   columns=['A', 'B', 'C', 'D', 'E'])
    z = df[['A', 'C', 'A']]
    result = z.loc[['a', 'c', 'a']]
    check(result, expected)
# --- Fragment: row-wise vs column-wise broadcasting ---
# the values of s1 at 0, 1, 2 are applied to all of b, d, e in df
s2 = Series(range(3), index=list('bef'))
print(s2)
print(df + s2)

s3 = df['d']
print(s3)
# adding a column Series introduces entirely new index labels, so the
# result is all NaN
print(df + s3)
# to operate along the rows, use the arithmetic methods (add, sub, ...)
# with an axis argument
print(df.add(s3, axis=0))
print(df.sub(s3, axis=0))

# Function application and mapping
## a function applied to each element of an array is called a universal function
# numpy.random.randn generates normally distributed random data
df = DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['seoul', 'busan', 'daegu', 'incheon'])
print(df)
print(np.abs(df))  # absolute values
f = lambda x: x.max() - x.min()

# --- Python 2 fragment: broadcasting demo (legacy: data.ix) ---
print data.ix[:'Utha','two']
print '----print data.ix[data.three>5,:3]-----'
print data.ix[data.three>5,:3]
print "#-------广播------#"
frame = DataFrame(np.arange(12.).reshape(4,3),
                  index=['Utah','Ohio','Texas','Oregon'],
                  columns=list('bde')
                  )
print frame
series = frame['d']
print series
# NOTE(review): frame has columns b/d/e only — `frame.state` raises
# AttributeError; likely left over from a different example.
print frame.state['Utah']
# default broadcasting treats the Series as a row; axis=0 broadcasts it
# down the rows instead (column-wise operation)
print frame.sub(series,axis=0)
def practice_two():
    """Walk through basic pandas operations: reindexing, dropping axis
    entries, indexing/filtering, arithmetic alignment, function
    application/mapping, sorting/ranking, and duplicate axis labels.

    All intermediate results are computed and discarded; the function
    only demonstrates the APIs and returns None.
    """
    # --- Reindexing ---
    obj = Series(['b', 'p', 'y'], index=[0, 2, 4])
    # fill methods: 'ffill'/'pad' carry values forward,
    #               'bfill'/'backfill' carry values backward
    obj.reindex(range(6), method='ffill')

    # 3x3 frame: row labels from `index`, column labels from `columns`
    frame = DataFrame(np.arange(9).reshape((3, 3)),
                      index=['a', 'c', 'd'],
                      columns=['Ohio', 'Texas', 'California'])
    frame2 = frame.reindex(['a', 'b', 'c', 'd'])  # inserts missing row 'b'
    states = ['Texas', 'Utah', 'California']
    frame.reindex(columns=states)  # columns can be reindexed too
    # reindex parameters:
    #   index      new sequence to use as the index
    #   method     interpolation (fill) method
    #   fill_value substitute value when new labels introduce missing data
    #   limit      maximum fill count for forward/backward filling
    #   level      match a simple index against a MultiIndex level
    #   copy       True (default) always copies; False skips the copy
    #              when old and new indexes are equal

    # --- Dropping entries from an axis ---
    obj = Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
    obj.drop('c')          # drop row 'c'
    obj.drop(['d', 'c'])   # drop rows 'd' and 'c'
    data = DataFrame(np.arange(16).reshape((4, 4)),
                     index=['o', 'c', 'u', 'n'],
                     columns=['one', 'two', 'three', 'four'])
    data.drop(['two', 'four'], axis=1)  # drop columns 'two' and 'four'

    # --- Indexing, selection, filtering ---
    obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
    obj['b']               # label lookup (same element as position 1)
    obj.iloc[2:4]          # positional slice (plain obj[2:4] is deprecated)
    obj[['b', 'a', 'd']]   # fancy indexing by label
    obj.iloc[[1, 3]]       # fancy indexing by position
    obj[obj < 2]           # boolean filtering
    obj['b':'c']           # label slices INCLUDE the end point
    obj['b':'c'] = 5       # setting through a label slice
    # DataFrame indexing options (modern replacements for the old .ix):
    #   obj[val]          select a single column or a set of columns
    #   obj.loc[val]      select rows by label
    #   obj.loc[v1, v2]   select rows and columns at once
    #   obj.iloc[...]     select by integer position
    #   reindex           conform one or more axes to new indexes
    #   xs                select a single row/column by label as a Series
    #   at / iat          scalar lookup by label / by position

    # --- Arithmetic and data alignment ---
    s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
    # Fixed: the original supplied 5 values but 6 index labels, which
    # raises ValueError at construction time.
    s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
    s1 + s2                   # NA wherever the indexes do not overlap
    s1.add(s2, fill_value=0)  # treat missing entries as 0 instead of NA
    # Fixed: Series.reindex has no `columns` argument (that form is for
    # DataFrames); align s1 on s2's index instead.
    s1.reindex(s2.index, fill_value=0)
    # flexible arithmetic methods: add (+), sub (-), div (/), mul (*)

    # Fixed: reshape((3, 4)) cannot carry 3 column labels and 4 row
    # labels; a 4x3 frame matches 'bde' columns and the 4 row labels.
    frame = DataFrame(np.arange(12.).reshape((4, 3)),
                      columns=list('bde'),
                      index=['U', 'O', 'T', 'R'])
    series = frame.iloc[0]    # first row (.ix was removed in pandas 1.0)
    frame - series            # broadcast the row down the frame
    series2 = Series(range(3), index=['b', 'e', 'f'])
    frame + series2           # non-overlapping label 'f' produces NA
    series3 = frame['d']
    frame.sub(series3, axis=0)  # match on the row index instead

    # --- Function application and mapping ---
    frame = DataFrame(np.random.randn(4, 3),  # fixed: 4 rows x 3 columns
                      columns=list('bde'),
                      index=['U', 'O', 'T', 'R'])
    np.abs(frame)             # element-wise absolute value
    f = lambda x: x.max() - x.min()
    frame.apply(f)            # once per column
    frame.apply(f, axis=1)    # once per row
    fmt = lambda x: '%.2f' % x  # renamed: `format` shadows the builtin
    # element-wise formatting; portable equivalent of the deprecated
    # DataFrame.applymap / the newer DataFrame.map
    frame.apply(lambda col: col.map(fmt))
    frame['e'].map(fmt)

    # --- Sorting and ranking ---
    #   .sort_index()                 sort rows lexicographically by label
    #   .sort_index(axis=1)           sort the columns instead
    #   .sort_index(ascending=False)  descending (ascending is the default)
    #   .sort_values()                sort a Series by value (was .order())
    #   .sort_values(by='*')          sort a frame by column '*'
    #   .rank(ascending=False, method='first', axis=1)
    #       method: 'average' (default), 'min', 'max',
    #               'first' (rank in order of appearance)

    # --- Axis indexes with duplicate labels ---
    obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
    obj.index.is_unique       # False here: labels repeat
    pass
def test_column_dups_indexing(self):
    """Regression tests for indexing/arithmetic on DataFrames with
    duplicate column (and index) labels; pytest-style counterpart of
    the older unittest version.  GH issue numbers noted inline."""

    def check(result, expected=None):
        # Compare against ``expected`` when given; always touch
        # ``.dtypes`` and ``str()`` so neither blows up on dup labels.
        if expected is not None:
            assert_frame_equal(result, expected)
        result.dtypes
        str(result)

    # boolean indexing
    # GH 4879
    dups = ['A', 'A', 'C', 'D']
    df = DataFrame(np.arange(12).reshape(3, 4),
                   columns=['A', 'B', 'C', 'D'], dtype='float64')
    expected = df[df.C > 6]
    expected.columns = dups  # relabel so the frames compare equal
    df = DataFrame(np.arange(12).reshape(3, 4), columns=dups,
                   dtype='float64')
    result = df[df.C > 6]
    check(result, expected)

    # where
    df = DataFrame(np.arange(12).reshape(3, 4),
                   columns=['A', 'B', 'C', 'D'], dtype='float64')
    expected = df[df > 6]
    expected.columns = dups
    df = DataFrame(np.arange(12).reshape(3, 4), columns=dups,
                   dtype='float64')
    result = df[df > 6]
    check(result, expected)

    # boolean with the duplicate raises: df.A is itself a 2-column
    # frame here, so the mask cannot be aligned
    df = DataFrame(np.arange(12).reshape(3, 4), columns=dups,
                   dtype='float64')
    msg = "cannot reindex from a duplicate axis"
    with pytest.raises(ValueError, match=msg):
        df[df.A > 6]

    # dup aligning operations should work
    # GH 5185
    df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
    df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
    expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
    result = df1.sub(df2)
    assert_frame_equal(result, expected)

    # equality
    df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]],
                    columns=['A', 'B'])
    df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]],
                    columns=['A', 'A'])

    # not-comparing like-labelled frames must raise
    msg = "Can only compare identically-labeled DataFrame objects"
    with pytest.raises(ValueError, match=msg):
        df1 == df2

    # after conforming the labels, element-wise comparison works
    df1r = df1.reindex_like(df2)
    result = df1r == df2
    expected = DataFrame(
        [[False, True], [True, False], [False, False], [True, False]],
        columns=['A', 'A'])
    assert_frame_equal(result, expected)

    # mixed column selection
    # GH 5639
    dfbool = DataFrame({
        'one': Series([True, True, False], index=['a', 'b', 'c']),
        'two': Series([False, False, True, False],
                      index=['a', 'b', 'c', 'd']),
        'three': Series([False, True, True, True],
                        index=['a', 'b', 'c', 'd'])
    })
    expected = pd.concat([dfbool['one'], dfbool['three'], dfbool['one']],
                         axis=1)
    result = dfbool[['one', 'three', 'one']]
    check(result, expected)

    # multi-axis dups
    # GH 6121
    df = DataFrame(np.arange(25.).reshape(5, 5),
                   index=['a', 'b', 'c', 'd', 'e'],
                   columns=['A', 'B', 'C', 'D', 'E'])
    z = df[['A', 'C', 'A']].copy()
    expected = z.loc[['a', 'c', 'a']]

    df = DataFrame(np.arange(25.).reshape(5, 5),
                   index=['a', 'b', 'c', 'd', 'e'],
                   columns=['A', 'B', 'C', 'D', 'E'])
    z = df[['A', 'C', 'A']]
    result = z.loc[['a', 'c', 'a']]
    check(result, expected)
RF = ensemble.RandomForestRegressor() trainY = DataFrame(trainSample["SalePrice"]) trainX = trainSample.drop(columns="SalePrice") testY = DataFrame(testSample["SalePrice"]) testX = testSample.drop(columns="SalePrice") BR.fit(trainX,trainY.values.ravel()) PC.fit(trainX,trainY.values.ravel()) RF.fit(trainX,trainY.values.ravel()) SVR.fit(trainX,trainY.values.ravel()) resultBR = BR.predict(testX) resultPC = PC.predict(testX) resultSVR = SVR.predict(testX) resultRF = RF.predict(testX) resultBRdf = DataFrame(resultBR, columns=["SalePrice"]) differenceBR = resultBRdf.sub(testY.reset_index(drop=True)) resultPCdf = DataFrame(resultPC, columns=["SalePrice"]) differencePC = resultPCdf.sub(testY.reset_index(drop=True)) resultRFdf = DataFrame(resultRF, columns=["SalePrice"]) differenceRF = resultRFdf.sub(testY.reset_index(drop=True)) resultSVRdf = DataFrame(resultSVR, columns=["SalePrice"]) differenceSVR = resultSVRdf.sub(testY.reset_index(drop=True)) print(metrics.mean_absolute_error(resultBR, testY.reset_index(drop=True))) print(metrics.mean_absolute_error(resultPC, testY.reset_index(drop=True))) print(metrics.mean_absolute_error(resultRF, testY.reset_index(drop=True))) print(metrics.mean_absolute_error(resultSVR, testY.reset_index(drop=True)))