def test_binary_ops_align(self):
    """Check that flex binary ops align a Series on one MultiIndex level.

    Covers GH 6681 (row alignment on a named index level) and GH 9463
    (column alignment when the level is given by position or by name).
    """
    # GH 6681
    index = MultiIndex.from_product(
        [list("abc"), ["one", "two", "three"], [1, 2, 3]],
        names=["first", "second", "third"],
    )
    # sort_index() replaces sortlevel(), which no longer exists in pandas.
    df = DataFrame(
        np.arange(27 * 3).reshape(27, 3),
        index=index,
        columns=["value1", "value2", "value3"],
    ).sort_index()

    idx = pd.IndexSlice
    for op in ["add", "sub", "mul", "div", "truediv"]:
        # Names missing from the operator module (e.g. "div") are skipped.
        opa = getattr(operator, op, None)
        if opa is None:
            continue

        x = Series([1.0, 10.0, 100.0], [1, 2, 3])
        result = getattr(df, op)(x, level="third", axis=0)
        # items() replaces iteritems(), which was removed from pandas.
        expected = pd.concat(
            [opa(df.loc[idx[:, :, i], :], v) for i, v in x.items()]
        ).sort_index()
        assert_frame_equal(result, expected)

        # Only two of the three "second" labels are supplied, so the
        # expected frame is reindexed back to the full shape of df.
        x = Series([1.0, 10.0], ["two", "three"])
        result = getattr(df, op)(x, level="second", axis=0)
        expected = (
            pd.concat([opa(df.loc[idx[:, i], :], v) for i, v in x.items()])
            .reindex_like(df)
            .sort_index()
        )
        assert_frame_equal(result, expected)

    # GH9463 (alignment level of dataframe with series)
    midx = MultiIndex.from_product([["A", "B"], ["a", "b"]])
    df = DataFrame(np.ones((2, 4), dtype="int64"), columns=midx)
    s = pd.Series({"a": 1, "b": 2})

    df2 = df.copy()
    df2.columns.names = ["lvl0", "lvl1"]

    s2 = s.copy()
    s2.index.name = "lvl1"

    # different cases of integer/string level names:
    res1 = df.mul(s, axis=1, level=1)
    res2 = df.mul(s2, axis=1, level=1)
    res3 = df2.mul(s, axis=1, level=1)
    res4 = df2.mul(s2, axis=1, level=1)
    res5 = df2.mul(s, axis=1, level="lvl1")
    res6 = df2.mul(s2, axis=1, level="lvl1")

    exp = DataFrame(
        np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype="int64"), columns=midx
    )

    for res in [res1, res2]:
        assert_frame_equal(res, exp)

    exp.columns.names = ["lvl0", "lvl1"]
    for res in [res3, res4, res5, res6]:
        assert_frame_equal(res, exp)
def getRecommendations(dfobj, people, similarity=sim_pearson):
    """Rank the rows that ``people`` has no value for by a weighted score.

    The other columns of ``dfobj`` are weighted by their similarity to the
    ``people`` column; only positively similar columns contribute to the
    numerator.

    :param dfobj: DataFrame whose columns include ``people``.
    :param people: label of the column to produce recommendations for.
    :param similarity: callable ``(dfobj, other, people) -> float``.
    :return: Series of scores sorted in descending order, restricted to the
        rows where ``dfobj[people]`` is null.
    """
    others = dfobj.drop(people, axis=1)
    cor_df = Series({p: similarity(dfobj, p, people) for p in others})

    # Similarity-weighted values, one row per other column.
    mul_df = DataFrame.mul(others.T, cor_df[cor_df > 0], axis=0)
    # Sum of the similarities of the columns that actually hold a value.
    weight_totals = DataFrame.mul(pd.notnull(mul_df), cor_df, axis=0).sum()
    scores = Series.div(mul_df.sum(), weight_totals)

    # Series.order() was removed from pandas; sort_values() replaces it.
    ranked = scores.sort_values(ascending=False)
    return ranked[pd.isnull(dfobj[people])]
def update_factor(self):
    """Regenerate the factor, neutralize it and merge it into the stored CSV.

    Rows of the freshly computed factor that are newer than the last row
    already stored in ``<FACTORBASE_PATH>/Data/<factor_name>.csv`` are
    appended to that file; if the file does not exist the full factor is
    written.
    """
    self.generate_factor()

    # TODO: drive these two switches from self.neutral_list
    # ('industry' / 'market_capitalization'); they are hard-coded for now.
    neutralize_industry = True
    neutralize_size = False

    if neutralize_industry:
        industrys = tools.get_industrys('L1', self.stocks)
        # Drop industries without any member stock.
        industrys = {k: v for k, v in industrys.items() if len(v) > 0}
        factor = tools.standardize_industry(self.factor, industrys)

    if neutralize_size:
        market_capitalization = DataFrame({
            stock: pd.read_csv(
                '%s/StockTradingDerivativeData/Stock/%s.csv' % (gc.DATABASE_PATH, stock),
                index_col=[0], parse_dates=[0]).loc[:, 'TOTMKTCAP']
            for stock in self.stocks
        })
        market_capitalization = np.log(market_capitalization)
        if self.start_date:
            market_capitalization = market_capitalization.loc[
                market_capitalization.index >= self.start_date, :]
        if self.end_date:
            market_capitalization = market_capitalization.loc[
                market_capitalization.index <= self.end_date, :]
        if neutralize_industry:
            market_capitalization = tools.standardize_industry(
                market_capitalization, industrys)
        # Per-date regression slope of the factor on log market cap.
        beta = (factor * market_capitalization).sum(1) / (
            market_capitalization * market_capitalization).sum(1)
        factor = factor - market_capitalization.mul(beta, axis=0)

    # NOTE(review): this fills self.factor, not the neutralized `factor`
    # that is written below - confirm which one was intended.
    self.factor.fillna(0, inplace=True)

    path = '%s/Data/%s.csv' % (gc.FACTORBASE_PATH, self.factor_name)
    if os.path.exists(path):
        # parse_dates keeps the stored index comparable with the new one.
        # Assumes the CSV index holds dates - TODO confirm.
        factor_old = pd.read_csv(path, index_col=[0], parse_dates=[0])
        if len(factor_old.index) > 0:
            # Keep only rows newer than what is already stored. The previous
            # code compared `factor` with its own last index, which never
            # selected anything.
            new_rows = factor.loc[factor.index > factor_old.index[-1], :]
            factor = pd.concat([factor_old, new_rows], axis=0)
    factor.sort_index(axis=0, inplace=True)
    factor.sort_index(axis=1, inplace=True)
    factor.to_csv(path)
def chi(self, customattribute):
    """Compute the chi-square statistic of ``customattribute`` against the class attribute.

    Distinct ``studentID`` values are counted per (attribute, class) pair
    across all chunks. The expected count of a cell is the row total
    multiplied by the overall share of that class.

    :param customattribute: name of the column to test.
    :return: ``(chisquare_result, attributeObs)`` where ``attributeObs`` is
        the observed count table (rows: attribute values, columns: classes).
    """
    attributeDict = dict()
    classAttributeDict = dict()
    empty = np.array([])
    for piece in self.chunks:
        pair_ids = piece.groupby(
            [customattribute, self.classAttribute]).studentID.unique()
        for key, arrays in pair_ids.items():
            attributeDict[key] = np.union1d(attributeDict.get(key, empty), arrays)

        class_ids = piece.groupby(self.classAttribute).studentID.unique()
        for classAttribute, arrays in class_ids.items():
            classAttributeDict[classAttribute] = np.union1d(
                classAttributeDict.get(classAttribute, empty), arrays)

    # Share of each class among all students.
    classSeries = Series({k: len(v) for k, v in classAttributeDict.items()})
    classSeries = classSeries / classSeries.sum()

    # Observed counts for every attribute value.
    attributeObs = Series(
        {k: len(v) for k, v in attributeDict.items()}).unstack(fill_value=0)

    # Start every cell at its row total (float dtype so chisquare gets
    # numeric input), then scale by the class shares.
    attributeExp = DataFrame(index=attributeObs.index,
                             columns=attributeObs.columns, dtype=float)
    for index in attributeExp.index:
        attributeExp.loc[index] = attributeObs.loc[index].sum()
    attributeExp = attributeExp.mul(classSeries).fillna(0)

    # Chi-square statistic and p-value of observed against expected.
    return chisquare(attributeObs.stack(), attributeExp.stack()), attributeObs
def return_weight(self,
                  fact: pd.DataFrame,
                  fact_ret: pd.DataFrame = None,
                  hp: int = 1,
                  rp: int = 20,
                  algorithm='mean') -> [pd.Series, None]:
    """Combine factors into one composite using lagged, normalized weights.

    The weights (Pearson / Spearman correlations) are computed from labels,
    so relative to the current cross-section they are future data. They
    must be shifted before being matched with the factor values. Returns
    are computed from open prices by default, so for a rebalancing period
    of N days the original note asks for a shift of N + 1 + 1 days.

    :param fact: standardized factors
    :param fact_ret: factor returns
    :param rp: rolling window used to compute the weights
    :param hp: holding (rebalancing) period
    :param algorithm: weighting method passed to ``self._weight``
    :return: row-wise sum of the weighted factors
    """
    raw_weight = self._weight(fact_ret, rp, algorithm)
    magnitude = abs(raw_weight)

    # Normalize so that every row of weights sums to one.
    row_total = magnitude.sum(axis=1)
    normalized = magnitude.div(row_total, axis=0)

    # Align the weights with the factor values.
    # TODO: the shift length differs between price types.
    lagged = normalized.shift(hp + 1)

    # Composite factor.
    weighted = fact.mul(lagged)
    return weighted.sum(axis=1)
def MAX_IC_IR(self,
              fact: pd.DataFrame,
              fact_ret: pd.DataFrame = None,
              hp: int = 1,
              rp: int = 20,
              way='IC_IR',
              comp_name: str = 'comp_factor'):
    """Build a composite factor from weights solved on rolling windows.

    :param fact: factor values
    :param fact_ret: factor returns; shifted by ``hp`` before use
    :param hp: number of rows the returns are shifted by
    :param rp: length of the rolling window
    :param way: when ``'IC'``, the optimizer's ``data_cov`` is replaced by
        the covariance of ``fact`` over the window
    :param comp_name: name given to the returned Series
    :return: Series holding the row-wise sum of the weighted factors
    """
    # Adjust the returns.
    ret_real = fact_ret.shift(hp).dropna()

    w_list = []
    for i in range(rp, ret_real.shape[0] + 1):
        df_ = ret_real.iloc[i - rp:i, :]
        opt = self.OPT(df_)

        if way == 'IC':
            opt.data_cov = np.array(fact.loc[df_.index].cov())

        res_ = opt.solve()
        # Label the weights with the last row of the window.
        w_list.append(pd.Series(res_.x, index=df_.columns, name=df_.index[-1]))

    w_df = pd.DataFrame(w_list)
    # W = w_df.shift(hp)
    fact_comp = fact.mul(w_df).sum(axis=1)
    # The name used to be set to the Series itself, which is unhashable and
    # makes pandas raise; the comp_name parameter is the intended value.
    fact_comp.name = comp_name
    return fact_comp
def retWeighted(self,
                fact: pd.DataFrame,
                factWeight: pd.DataFrame,
                algorithm: str = 'RetMean',
                **kwargs) -> pd.Series:
    """Combine factors into one composite factor.

    Parameters
    ----------
    fact : factor values
    factWeight : data the weights are derived from
    algorithm : ``'RetMean'`` (mean of historical returns) and ``'HalfTime'``
        (half-life weighted historical returns) are passed on to
        ``self._weight``; ``'equal'`` takes the plain row mean per trade date
    kwargs : accepted for signature compatibility, not used here

    Returns
    -------
    The composite factor.
    """
    if algorithm != 'equal':
        # Build the weights.
        factWeightNew = abs(self._weight(factWeight, self.rp, algorithm))
        # Normalize so that every row of weights sums to one.
        factWeightStand = factWeightNew.div(factWeightNew.sum(axis=1), axis=0)
        # Align the weights with the factor values.
        factWeightStand = factWeightStand.shift(self.hp + 1)
        # Composite factor.
        fact_comp = fact.mul(factWeightStand).sum(axis=1)
    else:
        fact_comp = fact.groupby(KN.TRADE_DATE.value, group_keys=False).apply(
            lambda x: x.mean(axis=1))
    return fact_comp
def multiple_predictions(self, features: DataFrame) -> Series:
    """Return one linear prediction per row of ``features``.

    Each column is multiplied by the matching entry of ``self.gradients``,
    the products are summed per row and ``self.theta_0`` is added.

    Raises:
        ValueError: if the number of columns differs from the number of
            gradients.
    """
    columns = features.shape[1]
    n_gradients = len(self.gradients)
    if n_gradients != columns:
        raise ValueError(
            f"Features not the same length as gradients! Features: {columns}, Gradients: {n_gradients}"
        )
    weighted = features.mul(self.gradients)
    row_totals = weighted.sum(1)
    return row_totals.add(self.theta_0)
def structural_adj(self,
                   cov: pd.DataFrame,
                   spec_ret: pd.DataFrame,
                   fact_exp: pd.DataFrame,
                   liq_mv: pd.DataFrame,
                   liq_mv_name: PVN.LIQ_MV.value,
                   time_window: int = 120):
    """Apply the structural adjustment to the specific-return matrix.

    :param cov: Newey-West adjusted specific-return matrix of the stocks
    :param spec_ret: specific-return series of the stocks
    :param fact_exp: factor exposures
    :param liq_mv: free-float market value
    :param liq_mv_name: column name used for the free-float market value
    :param time_window: window of the specific returns (not used in this
        body; the original note suggests replacing it with the series length)
    :return: blend of the structural estimate and ``cov``, weighted by the
        per-stock coordination parameter
    """
    # --- coordination parameter -------------------------------------------
    n_valid = spec_ret.count()  # number of non-null observations
    V_n = (n_valid - 20 / 4) / 20 * 2  # degree of missing data (20 is a test value)

    # Equal-weighted sample std; a std that cannot be computed is set to 1.
    sigma_n = spec_ret.std().fillna(1)
    # Robust std estimate from the inter-quartile range.
    iqr = spec_ret.quantile(.75) - spec_ret.quantile(0.25)
    sigma_n_steady = iqr / 1.35

    # Fat-tail measure; infinite and missing values are replaced by 0.
    Z_n = abs((sigma_n - sigma_n_steady) / sigma_n_steady)
    Z_n[np.isinf(Z_n)] = 0
    Z_n.fillna(0, inplace=True)

    # Both terms are capped at 1; the first is also floored at 0.
    left_ = V_n.where(V_n > 0, 0)
    left_ = left_.where(left_ < 1, 1)
    right_ = np.exp(1 - Z_n)
    right_ = right_.where(right_ < 1, 1)
    gam_n = left_ * right_  # per-stock coordination parameter

    # --- regression on the stocks with gam_n == 1 -------------------------
    reg_data = pd.concat([np.log(sigma_n), liq_mv, gam_n, fact_exp], axis=1)
    reg_data.columns = ['sigma', liq_mv_name, 'gam_n'] + fact_exp.columns.tolist()
    ref_data_com = reg_data[reg_data['gam_n'] == 1]

    # NOTE(review): the original comment describes a market-value weighted
    # regression, but the weights passed here are gam_n (all equal to 1
    # after the filter above) - confirm.
    model = sm.WLS(ref_data_com['sigma'],
                   ref_data_com[fact_exp.columns],
                   weights=ref_data_com['gam_n']).fit()

    # Structural specific-volatility forecast, placed on a diagonal.
    forecast = np.exp(np.dot(fact_exp, model.params)) * 1.05
    sigma_STR = pd.DataFrame(np.diag(forecast),
                             index=fact_exp.index,
                             columns=fact_exp.index)

    # Blend the structural estimate with the input matrix.
    structural_part = sigma_STR.mul((1 - gam_n), axis=0)
    sample_part = cov.mul(gam_n, axis=0)
    F_STR = structural_part + sample_part
    return F_STR
def mean_aggregate(
    indices: pd.DataFrame,
    weight_shares: FrameOrSeriesUnion,
    axis: int = 1,
) -> pd.Series:
    """Aggregate indices and weight shares as a sum product along ``axis``."""
    weighted = indices.mul(weight_shares)
    # min_count=1 makes the sum NA, not 0, when every value being summed
    # is NA.
    return weighted.sum(axis=axis, min_count=1)
def partial_correlation(C, X, S, N):
    """Correlate ``C`` and ``X`` after removing the linear effect of ``S``.

    Both ``C`` and ``X`` are regressed on ``S``; the returned value is the
    Pearson correlation formula applied to the two residual series.

    :param C: first variable, one value per sample
    :param X: second variable, one value per sample
    :param S: controlling variables, one row per sample
    :param N: sample count used in the correlation formula
    :return: the correlation of the residuals
    """
    S = DataFrame(S)
    C = Series(C)
    X = Series(X)
    linreg = linear_model.LinearRegression()

    def _residuals(target):
        """Residuals of a linear fit of ``target`` on ``S``."""
        linreg.fit(S, target)
        # Sum across columns (axis=1) to get one fitted value per sample.
        # The previous axis-0 sum produced one value per column, which then
        # misaligned with the per-sample target.
        fitted = S.mul(linreg.coef_, axis=1).sum(axis=1) + linreg.intercept_
        return target - fitted

    R_c = _residuals(C)  # residuals of C on S
    R_x = _residuals(X)  # residuals of X on S

    numerator = N * (R_c * R_x).sum() - R_c.sum() * R_x.sum()
    denominator = (
        np.sqrt(N * np.square(R_c).sum() - np.square(R_c.sum()))
        * np.sqrt(N * np.square(R_x).sum() - np.square(R_x.sum()))
    )
    rou = numerator * 1.0 / denominator
    return rou
def __get_a_portfolio(self, df: pd.DataFrame) -> Union[pd.DataFrame, None]:
    """
    Build the portfolio series from a previously calculated table.

    Every column of the table is multiplied by its weight and the products
    are summed per row. A row containing a missing product yields a missing
    result, because the sum does not skip NA values.

    :param df: R[i,t], CR[i,t] or TR[i,t].
    :return: the weighted row sums, or None when no weights are set
    """
    weights = self._df_raw['weights']
    if weights is None:
        return None
    weighted = df.mul(weights)
    return weighted.sum(axis=1, skipna=False)
def PCA(self, fact: pd.DataFrame, rp: int = 20, comp_name: str = 'comp_factor'):
    """Build a composite factor weighted by rolling first principal components.

    For every window of ``rp`` consecutive rows a one-component PCA is
    fitted; its component vector is used as the weight row labeled with
    the last row of the window.

    :param fact: factor values
    :param rp: length of the rolling window
    :param comp_name: name given to the returned Series
    :return: Series holding the row-wise sum of the weighted factors
    """
    w_list = []
    for i in range(rp, fact.shape[0] + 1):
        df_ = fact.iloc[i - rp:i, :]

        pca = PCA(n_components=1)
        pca.fit(np.array(df_))
        weight = pca.components_[0]
        w_list.append(pd.Series(data=weight, index=df_.columns, name=df_.index[-1]))

    w_df = pd.DataFrame(w_list)
    fact_comp = fact.mul(w_df).sum(axis=1)
    # The name used to be set to the Series itself, which is unhashable and
    # makes pandas raise; use an explicit, overridable name instead.
    fact_comp.name = comp_name
    return fact_comp
def optPort_nco(cov, mu=None, maxNumClusters=10):
    """Portfolio optimization using the NCO method.

    Weights are first solved inside each cluster, then across the clusters
    on the reduced covariance, and finally multiplied together.

    :param cov: covariance matrix
    :param mu: optional 2-D array; only its first column is used
    :param maxNumClusters: passed to ``clusterKMeansBase``
    :return: column vector (n, 1) with the final weights
    """
    cov = DataFrame(cov)
    if mu is not None:
        mu = Series(mu[:, 0])

    corr1 = cov2corr(cov)
    corr1, clstrs, _ = clusterKMeansBase(corr1, maxNumClusters, n_init=10)

    # Created as float: the cells are filled with float weights below, and
    # writing floats into an integer column relies on implicit upcasting.
    wIntra = DataFrame(0.0, index=cov.index, columns=list(clstrs.keys()))
    for i in clstrs:
        members = clstrs[i]
        cov_ = cov.loc[members, members].values
        mu_ = None if mu is None else mu.loc[members].values.reshape(-1, 1)
        wIntra.loc[members, i] = optPort(cov_, mu_).flatten()

    # Reduced covariance / mean between the clusters.
    cov_ = wIntra.T.dot(np.dot(cov, wIntra))
    mu_ = None if mu is None else wIntra.T.dot(mu)
    wInter = Series(optPort(cov_, mu_).flatten(), index=cov_.index)

    nco = wIntra.mul(wInter, axis=1).sum(axis=1).values.reshape(-1, 1)
    return nco
def _calculate_delta(resp_matrix: pd.DataFrame,
                     meas_dict: dict,
                     keys: Sequence[str],
                     vars_list: Sequence[str],
                     method: str,
                     meth_opt):
    """Get the deltas for the variables.

    The response matrix and the measured differences are both scaled by the
    weights before being handed to the function selected by ``method``.

    Returns:
        Dataframe with the single column ``DELTA`` and ``vars_list`` as index.
    """
    weights = _join_columns(f"{WEIGHT}", meas_dict, keys)
    diffs = _join_columns(f"{DIFF}", meas_dict, keys)

    resp_weighted = resp_matrix.mul(weights, axis="index")
    diff_weighted = diffs * weights

    solver = _get_method_fun(method)
    raw_delta = solver(resp_weighted, diff_weighted, meth_opt)
    delta = tfs.TfsDataFrame(raw_delta, index=vars_list, columns=[DELTA])

    # Check the calculation: apply the deltas to the weighted response and
    # report it next to the weighted differences.
    update = np.dot(resp_weighted, delta[DELTA])
    _print_rms(meas_dict, diff_weighted, update)

    return delta
def generate_factor(self):
    """Compute the factor: log of the rolling std of market-model residuals.

    Adjusted closes (close * adj_factor) are turned into log returns, each
    stock is regressed on the cross-sectional mean return over a rolling
    window of 20 rows, and the log of the rolling standard deviation of
    the residuals is stored in ``self.factor``.
    """
    # Read each file once and build the adjusted close directly.
    adj_close = {}
    for stock in self.stocks:
        daily = pd.read_csv('%s/StockDailyData/Stock/%s.csv' % (gc.DATABASE_PATH, stock),
                            index_col=[0], parse_dates=[0])
        adj_close[stock] = daily.loc[:, 'close'] * daily.loc[:, 'adj_factor']
    # ffill() replaces the deprecated fillna(method='ffill').
    CLOSE = DataFrame(adj_close).ffill()

    r = np.log(CLOSE).diff()
    r_m = r.mean(1)
    # Repeat the mean return once per stock so it lines up with r.
    r_m = DataFrame({stock: r_m for stock in r.columns})
    n = 20

    def reg(y, x, n):
        """Rolling ordinary-least-squares intercept and slope of y on x."""
        lxx = (x**2).rolling(n).sum() - n * (x.rolling(n).mean()**2)
        lxy = (x * y).rolling(n).sum() - n * x.rolling(n).mean() * y.rolling(n).mean()
        beta = lxy / lxx
        alpha = y.rolling(n).mean() - beta * x.rolling(n).mean()
        return alpha, beta

    alpha, beta = reg(r, r_m, n)
    e = r.subtract(alpha, axis=1) - r_m.mul(beta, axis=1)
    m = n
    a = np.log(e.rolling(m).std())

    # Skip a bound that is not set instead of comparing against None.
    if self.start_date:
        a = a.loc[a.index >= self.start_date, :]
    if self.end_date:
        a = a.loc[a.index <= self.end_date, :]
    self.factor = a
def placeAnalysize(self, placeAttribute, amountAttribute):
    """Chi-square test of per-student spending by place against the class attribute.

    Only rows whose ``type`` equals '消费' are used. Amounts are summed per
    student for every class and for every (place, class) pair.

    :param placeAttribute: name of the place column
    :param amountAttribute: name of the amount column
    :return: ``(chisquare_result, dfExp, dfObs)`` - the test result, the
        expected table and the observed table
    """
    placeAttributeDict = dict()
    classAttributeDict = dict()
    for piece in self.chunks:
        spending = piece[piece['type'] == '消费']

        for work, df in spending.groupby(self.classAttribute):
            previous = classAttributeDict.get(work, Series(dtype=float))
            classAttributeDict[work] = previous.add(
                df.groupby('studentID')[amountAttribute].sum(), fill_value=0)

        for key, df in spending.groupby([placeAttribute, self.classAttribute]):
            previous = placeAttributeDict.get(key, Series(dtype=float))
            placeAttributeDict[key] = previous.add(
                df.groupby('studentID')[amountAttribute].sum(), fill_value=0)

    # Observed value: mean spending per student for each (place, class).
    dfObs = Series(
        {k: v.mean() for k, v in placeAttributeDict.items()}).unstack(fill_value=0)

    # Share of each class in the total spending (acts as the prior).
    placeAttributeExp = Series({k: v.sum() for k, v in classAttributeDict.items()})
    placeAttributeExp = placeAttributeExp / placeAttributeExp.sum()
    # The result of fillna used to be discarded; keep it.
    placeAttributeExp = placeAttributeExp.fillna(0)

    # Expected value: the row total split by the class shares, divided by
    # the number of students in the cell.
    totals = Series(
        {k: v.sum() for k, v in placeAttributeDict.items()}).unstack(fill_value=0)
    counts = Series(
        {k: v.size for k, v in placeAttributeDict.items()}).unstack(fill_value=0)
    dfExp = DataFrame(index=dfObs.index, columns=dfObs.columns, dtype=float)
    for index in dfObs.index:
        dfExp.loc[index] = totals.loc[index].sum()
    dfExp = dfExp.mul(placeAttributeExp).fillna(0) / counts

    # An expected value of 0 breaks the chi-square computation, so missing
    # and infinite cells are replaced by a tiny positive number.
    dfExp = dfExp.replace([np.inf, -np.inf], np.nan).fillna(0.000001)

    return chisquare(dfObs.stack(), dfExp.stack()), dfExp, dfObs
def add_noise(df: DataFrame, a=-1, b=1, method='add') -> DataFrame:
    """
    Combine every cell of a DataFrame with its own random number.

    :param df: The DataFrame we want to add noise to
    :param a: lower limit of the random numbers
    :param b: upper limit of the random numbers
    :param method: 'add' or 'mul': add the random value to, or multiply it
        with, every value
    :return: A new DataFrame with the noise applied
    :raises AssertionError: if ``method`` is neither 'add' nor 'mul'
    """
    import numpy as np

    shape = (len(df), len(df.columns))
    rand_vector = (b - a) * np.random.random_sample(shape) + a
    print(rand_vector)

    if method == 'add':
        noisy = df.add(rand_vector, axis=0)
    elif method == 'mul':
        noisy = df.mul(rand_vector, axis=0)
    else:
        raise AssertionError("Method must be either 'add' or 'mul'")
    return noisy
def chi(self, customattribute):
    """Compute the chi-square statistic of ``customattribute`` against the class attribute.

    Distinct ``studentID`` values are counted per (attribute, class) pair
    across all chunks. The expected count of a cell is the row total
    multiplied by the overall share of that class.

    :param customattribute: name of the column to test.
    :return: ``(chisquare_result, attributeObs)`` where ``attributeObs`` is
        the observed count table (rows: attribute values, columns: classes).
    """
    attributeDict = dict()
    classAttributeDict = dict()
    empty = np.array([])
    for piece in self.chunks:
        pair_ids = piece.groupby(
            [customattribute, self.classAttribute]).studentID.unique()
        for key, arrays in pair_ids.items():
            attributeDict[key] = np.union1d(attributeDict.get(key, empty), arrays)

        class_ids = piece.groupby(self.classAttribute).studentID.unique()
        for classAttribute, arrays in class_ids.items():
            classAttributeDict[classAttribute] = np.union1d(
                classAttributeDict.get(classAttribute, empty), arrays)

    # Share of each class among all students.
    classSeries = Series({k: len(v) for k, v in classAttributeDict.items()})
    classSeries = classSeries / classSeries.sum()

    # Observed counts for every attribute value.
    attributeObs = Series(
        {k: len(v) for k, v in attributeDict.items()}).unstack(fill_value=0)

    # Start every cell at its row total (float dtype so chisquare gets
    # numeric input), then scale by the class shares.
    attributeExp = DataFrame(index=attributeObs.index,
                             columns=attributeObs.columns, dtype=float)
    for index in attributeExp.index:
        attributeExp.loc[index] = attributeObs.loc[index].sum()
    attributeExp = attributeExp.mul(classSeries).fillna(0)

    # Chi-square statistic and p-value of observed against expected.
    return chisquare(attributeObs.stack(), attributeExp.stack()), attributeObs
def process(self,
            data: pd.DataFrame,
            factWeight: pd.DataFrame,
            method: str = 'Equal',
            rp: int = 60,
            hp: int = 5,
            **kwargs) -> pd.DataFrame:
    """Combine a set of factors with the selected method.

    Some weights use future data, so they are shifted before being matched
    with the factor values.

    Parameters
    ----------
    data : factor set
    factWeight : factor weights
    method : ``'Equal'``, ``'RetWeight'`` or ``'OPT'``; ``None`` returns
        ``data`` unchanged
    rp : rolling window
    hp : holding period
    kwargs : forwarded to the selected method

    Returns
    -------
    The result of the selected method, or ``data`` when ``method`` is None.
    """
    self.rp, self.hp = rp, hp

    # Nothing to combine: skip the work below.
    if method is None:
        return data

    factDir = np.sign(factWeight.rolling(rp, min_periods=1).mean())
    # Returns are labels (predicted values); weighting by historical
    # returns needs one extra row of shift.
    factDir = factDir.shift(hp + 1)

    # Flip the factors so that all of them point in the positive direction.
    factNew = data.mul(factDir, level=0).dropna()
    factWeightNew = factWeight.abs()

    if method == 'Equal':
        # The default 'Equal' had no entry in the dispatch table and raised
        # KeyError.
        # NOTE(review): assumed to mean the equal-weight branch of
        # retWeighted (algorithm='equal') - confirm.
        equal_kwargs = dict(kwargs, algorithm='equal')
        return self.retWeighted(fact=factNew, factWeight=factWeightNew, **equal_kwargs)

    method_dict = {"RetWeight": self.retWeighted, "OPT": self.MAX_IC_IR}
    res = method_dict[method](fact=factNew, factWeight=factWeightNew, **kwargs)
    return res
def dataFrameMathTest():
    """Print the result of common DataFrame math methods on mlg.csv.

    Columns 1 to 6 of the file are used. The methods that return a Series
    work on columns by default.
    """
    # Load a DataFrame from a CSV file
    org_df = pd.read_csv('mlg.csv')
    df = org_df.iloc[:, 1:7]

    print(df.abs())      # absolute values
    print(df.count())    # non NA/null values
    print(df.cummax())   # cumulative max (cols default axis)
    print(df.cummin())   # cumulative min (cols default axis)
    print(df.cumsum())   # cumulative sum (cols default axis)
    print(df.diff())     # 1st difference (cols default axis)
    print(df.div(12))    # divide by a DataFrame, Series or value
    print(df.max())      # max per column
    print(df.mean())     # mean per column
    print(df.median())   # median per column
    print(df.min())      # min per column
    print(df.mul(2))     # multiply by a DataFrame, Series or value
    print(df.sum())      # sum per column
    print(df.where(df > 0.5, other=np.nan))  # keep values above 0.5, else NaN
def MAX_IC_IR(self,
              fact: pd.DataFrame,
              factWeight: pd.DataFrame,
              retType='IC_IR') -> pd.Series:
    """Build a composite factor from weights solved on rolling windows.

    :param fact: factor values; the weights are matched on index level 0
    :param factWeight: data the optimizer inputs are derived from
    :param retType: when ``'IC'`` the covariance is taken from ``fact`` over
        the window, otherwise from the window of ``factWeight`` itself
    :return: composite factor reindexed to ``fact.index``
    """
    # Set up the optimizer.
    # NOTE(review): a constraint is appended on every call, so repeated
    # calls accumulate entries in self.opt.limit - confirm this is intended.
    self.opt.obj_func = self.opt.object_func3
    self.opt.limit.append(self.opt.constraint())
    self.opt.bonds = ((0, 1),) * fact.shape[1]

    # Adjust the returns.
    factWeightNew = factWeight.shift(self.hp + 1).dropna(how='all')

    weightDict = {}
    for sub in range(self.rp, factWeightNew.shape[0] + 1):
        print(dt.datetime.now(), sub)

        df_ = factWeightNew.iloc[sub - self.rp: sub, :]
        data_mean = np.array(df_.mean())

        if retType == 'IC':
            data_cov = np.array(fact.loc[df_.index].cov())
        else:
            data_cov = np.array(df_.cov())

        optParams = {
            "data_mean": data_mean,
            "data_cov": data_cov,
        }
        self.opt.set_params(**optParams)
        res_ = self.opt.solve()

        # Weights are keyed by the last row of the window.
        weightDict[df_.index[-1]] = res_.x

    w_df = pd.DataFrame(weightDict, index=fact.columns).T
    fact_comp = fact.mul(w_df, level=0).dropna(how='all').sum(axis=1).reindex(fact.index)
    return fact_comp
def __portfolio_returns(w_shifted: pd.DataFrame, r: pd.DataFrame) -> pd.Series:
    """Return the row-wise sum of returns multiplied by their weights.

    :param w_shifted: weights, aligned with ``r`` by index and columns
    :param r: returns
    :return: one value per row (a Series, since the sum collapses the
        columns)
    """
    return r.mul(w_shifted).sum(axis=1)
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

############################## basic functions #################################
# basic functions: get partial info
# NOTE(review): `data` is not created in this part of the script; it is
# presumably loaded earlier - confirm.
data[['Director', 'id', 'Gerne', 'Runtime']]
# .loc replaces .ix, which was removed from pandas.
data.loc[['Crazy Asian', 'Movie 2'], ['Director', 'Runtime']]

# basic functions: math operations
df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape(4, 5), columns=list('abcde'))
df1 - df2  # labels are matched automatically; unmatched cells become NaN
df1.add(df2, fill_value=0)
df1.sub(df2, fill_value=0)
df1.mul(df2, fill_value=0)
df1.div(df2, fill_value=0)
df1.reindex(columns=df2.columns, fill_value=0)

# function map
df1 = DataFrame(np.random.randn(4, 3), columns=list('abc'),
                index=['id1', 'id2', 'id3', 'id4'])
np.abs(df1)
func1 = lambda x: x.max() - x.min()
# Apply the range function defined just above; `f` does not exist yet here.
df1.apply(func1, axis=1)


def f(x):
    """Return the max and min of ``x`` as a labeled Series."""
    return Series([x.max(), x.min()], index=['max', 'min'])


df1.apply(f)
format11 = lambda x: '%.2f' % x
def factor_analysis(self,
                    industry_neutral=True,
                    size_neutral=True,
                    num_group=10):
    """Neutralize the factor and save IC, IR and group back-test results.

    Labels y1..y5 are loaded from ``<LABELBASE_PATH>/Data``. The factor is
    optionally standardized within industries and regressed on log market
    capitalization. A histogram, cumulative IC / IR curves and one group
    back-test chart per label are written to
    ``<MULTIFACTOR_PATH>/Results/<factor_name>/<method>``.

    :param industry_neutral: standardize factor and labels within industries
    :param size_neutral: remove the projection on log market capitalization
    :param num_group: number of quantile groups in the back-test
    """
    self.factor = self.inf_to_nan(self.factor)
    stocks = self.stocks
    start_date = self.start_date
    end_date = self.end_date

    # Load the five labels, cut them to the date range and keep them as
    # self.y1 .. self.y5.
    ys = []
    for k in range(1, 6):
        y = pd.read_csv('%s/Data/y%s.csv' % (gc.LABELBASE_PATH, k),
                        index_col=[0], parse_dates=[0]).loc[:, stocks]
        if start_date:
            y = y.loc[y.index >= start_date, :]
        if end_date:
            y = y.loc[y.index <= end_date, :]
        setattr(self, 'y%s' % k, y)
        ys.append(y)

    result_dir = '%s/Results/%s/%s' % (gc.MULTIFACTOR_PATH, self.factor_name, self.method)
    # makedirs also creates missing parent directories, which mkdir did not.
    os.makedirs(result_dir, exist_ok=True)

    factor = self.factor.copy()

    # Industry neutralization
    if industry_neutral:
        industrys = tools.get_industrys('L1', self.stocks)
        # Drop industries without any member stock.
        industrys = {k: v for k, v in industrys.items() if len(v) > 0}
        factor = tools.standardize_industry(self.factor, industrys)
        self.factor_industry_neutral = factor.copy()

    # Size neutralization
    if size_neutral:
        market_capitalization = DataFrame({
            stock: pd.read_csv('%s/StockTradingDerivativeData/Stock/%s.csv' % (gc.DATABASE_PATH, stock),
                               index_col=[0], parse_dates=[0]).loc[:, 'TOTMKTCAP']
            for stock in self.stocks
        })
        market_capitalization = np.log(market_capitalization)
        if self.start_date:
            market_capitalization = market_capitalization.loc[
                market_capitalization.index >= self.start_date, :]
        if self.end_date:
            market_capitalization = market_capitalization.loc[
                market_capitalization.index <= self.end_date, :]
        if industry_neutral:
            market_capitalization = tools.standardize_industry(
                market_capitalization, industrys)
        cap_sq = (market_capitalization * market_capitalization).sum(1)
        # Per-date regression slope of the factor on log market cap.
        beta = (factor * market_capitalization).sum(1) / cap_sq
        factor = factor - market_capitalization.mul(beta, axis=0)
        self.factor_industry_size_neutral = factor.copy()

    # Factor distribution
    plt.figure(figsize=(16, 12))
    plt.hist(factor.fillna(0).values.flatten())
    plt.savefig('%s/hist.png' % result_dir)

    # Quantile of every factor value within its row. It does not depend on
    # the label, so it is computed once outside the loop.
    factor_quantile = DataFrame(
        rankdata(factor, axis=1), index=factor.index,
        columns=factor.columns).div(factor.notna().sum(1), axis=0)
    factor_quantile[factor.isna()] = np.nan

    # IC, IR and group back-test
    IC = {}
    IR = {}
    group_backtest = {}
    group_pos = {}
    for i in range(len(ys)):
        # Start from the raw label so the name is always bound; it used to
        # be undefined when industry_neutral was False.
        y_neutral = ys[i]
        if industry_neutral:
            y_neutral = tools.standardize_industry(ys[i], industrys)
        if size_neutral:
            y_beta = (y_neutral * market_capitalization).sum(1) / cap_sq
            y_neutral = y_neutral - market_capitalization.mul(y_beta, axis=0)

        IC[i] = (y_neutral * factor).mean(1) / factor.std(1) / y_neutral.std(1)
        IR[i] = IC[i].rolling(20).mean() / IC[i].rolling(20).std()

        group_backtest[i] = {}
        group_pos[i] = {}
        for n in range(num_group):
            # 1 where the quantile falls into group n, NaN elsewhere.
            in_group = DataFrame((n / num_group <= factor_quantile)
                                 & (factor_quantile <= (n + 1) / num_group))
            in_group[~in_group] = np.nan
            group_pos[i][n] = 1 * in_group
            # Cumulative mean label of the group in excess of all stocks.
            group_backtest[i][n] = (
                (group_pos[i][n] * ys[i]).mean(1) - ys[i].mean(1)
            ).cumsum().rename('%s' % (n / num_group))

    self.IC = IC
    self.IR = IR
    self.group_pos = group_pos
    self.group_backtest = group_backtest

    plt.figure(figsize=(16, 12))
    for i in range(len(ys)):
        IC[i].cumsum().plot()
    plt.legend(['%s' % i for i in range(len(ys))])
    plt.savefig('%s/IC.png' % result_dir)

    plt.figure(figsize=(16, 12))
    for i in range(len(ys)):
        IR[i].cumsum().plot()
    plt.legend(['%s' % i for i in range(len(ys))])
    plt.savefig('%s/IR.png' % result_dir)

    for i in range(len(ys)):
        plt.figure(figsize=(16, 12))
        for n in range(num_group):
            group_backtest[i][n].plot()
        plt.legend(['%s' % n for n in range(num_group)])
        plt.savefig('%s/groupbacktest%s.png' % (result_dir, i))
df1
df2 = DataFrame(np.arange(12).reshape(3, 4),
                index=['2014', '2015', '2016'],
                columns=['python', 'r', 'sql', 'plsql'])
df2

# df1 + df2
df1.add(df2, fill_value=0)
# df1 - df2
df1.sub(df2, fill_value=0)
# df1 * df2
df1.mul(df2, fill_value=1)
# NOTE(review): this repeats the multiplication above; a division example
# may have been intended - confirm.
df1.mul(df2, fill_value=1)

# Broadcasting: a single value keeps being passed along.
obj1 = np.arange(15).reshape(5, 3)
obj1

obj2 = np.arange(3)
obj2

# Broadcasting: obj2 is [0, 1, 2], so the same values are applied to every
# row of obj1 in turn; obj2 is stretched to the shape of obj1.
obj1 + obj2
def preprocess_res_sheet(
    res_sheet: pd.DataFrame,
    res_conversion_factors: pd.Series,
    template_index: pd.Index,
    years: List,
):
    """Clean a res sheet and convert its values with per-row factors.

    The first column provides the row labels and the data is taken from
    column 19 onwards, one column per entry of ``years``. Rows whose label
    is not in ``template_index`` are dropped, except for a fixed list of
    labels that are always kept. Values that cannot be parsed as numbers
    become NaN.

    :param res_sheet: raw sheet
    :param res_conversion_factors: factor per row label
    :param template_index: row labels expected in the sheet
    :param years: column labels of the data block
    :return: numeric frame, rounded to 2 decimals and multiplied row-wise
        by the conversion factors
    """
    # Labels that are kept even though the template does not list them.
    kept_labels = (
        "Verbrauch Sektor Energie: E5 & E7 (TJ)",
        "Verluste: E5 & E7 (TJ)",
        "Anrechenbare Erneuerbare (TJ)",
        "Anteil Erneuerbarer Energieträger insgesamt",
        "Anteil anrechenbare Erneuerbare Landwirtschaft",
        "Primärstrom Wasser real mit Pumpe (MWh)",
        "Primärstrom Wasser real ohne Pumpe (MWh)",
        "Umgebungswärme (anrechenbarer Anteil)",
    )

    # First column includes the index data
    sheet_index = pd.Index(res_sheet.iloc[:, 0])

    # Exclude empty leading columns; copy so the caller's frame and the
    # slice are never modified in place.
    res_sheet = res_sheet.iloc[:, 19 : 19 + len(years)].copy()
    res_sheet.columns = years

    # Check for differences amongst indices. Index.difference never returns
    # None, so test for emptiness to avoid warning on clean input.
    deviations = sheet_index.difference(template_index)
    if not deviations.empty:
        logger = logging.getLogger()
        logger.warning("\n\t{} WARNING {}".format("*" * 33, "*" * 33))

        deviation_idx = [label for label in kept_labels if label in deviations]
        deviations = deviations.drop(deviation_idx)

        # A regular (non-raw) string so the newline and tab are rendered.
        logger.warning(
            "Deviating indices in res data:\n\t{}".format(deviation_idx)
        )

    # Use the first column as the index and drop the rows that are not used
    res_sheet.index = sheet_index
    res_sheet = res_sheet.drop(index=deviations, errors="raise")

    # Non-numeric cells are coerced to NaN here, so no separate pass over
    # the cells is needed to clear leftover strings.
    res_sheet = res_sheet.apply(pd.to_numeric, errors="coerce").round(2)

    # Convert all values to TJ and MW
    return res_sheet.mul(res_conversion_factors, axis=0)
print(data[:2]) #通过切片获取数据 print(data[data['a1'] > 4]) #通过bool索引获取数据 print(data['a1']) print(data.ix['a', ['a1', 'b1']]) #获取第一个参数是行索引,第二个是列索引的范围 print(data.ix[data.a1 > 4, :3]) print(data.ix['a', 'a1']) #算数运算和数据对齐 print('---------------------------------------') d1 = DataFrame(np.arange(15).reshape(3, 5), index=list('abc')) d2 = DataFrame(np.arange(20).reshape(4, 5), index=list('abcd')) print(d1 + d2) #相加会自动在不重叠的索引处引入NaN,也可以设置默认缺失值 print(d1.add(d2, fill_value=0)) #加 print(d1.sub(d2, fill_value=0)) #减 print(d1.div(d2, fill_value=0)) #除 print(d1.mul(d2, fill_value=0)) #乘 print('-----------------Dataframe与Series之间的运算----------------------') #Dataframe与Series之间的运算 arr = DataFrame( np.arange(15).reshape(5, 3), columns=list('abc'), index=['ShangHai', 'BeiJing', 'Changsha', 'Hangzhou', 'Fujian']) series = arr.ix[0] #获取第一行数据 print(arr) print(series) print(arr - series) #默认情况下,DF和Series之间的算术运算会将Series的索引匹配到DF的列,然后向下广播 print(arr.sub(arr['a'], axis=0)) #在列上进行广播 print('-----------------Dataframe函数应用和映射----------------------') frame = DataFrame(np.random.randn(4, 3), columns=list('bde'),
def test_binary_ops_align(self):
    """Flex binary ops must align a Series on a single MultiIndex level.

    First part (GH 6681): row-wise alignment on a named index level.
    Second part (GH 9463): column-wise alignment, addressing the level by
    position and by name.
    """
    # GH 6681
    levels = [list("abc"), ["one", "two", "three"], [1, 2, 3]]
    index = MultiIndex.from_product(levels, names=["first", "second", "third"])
    values = np.arange(27 * 3).reshape(27, 3)
    df = DataFrame(values, index=index, columns=["value1", "value2", "value3"])
    df = df.sort_index()

    idx = pd.IndexSlice
    for op in ["add", "sub", "mul", "div", "truediv"]:
        if not hasattr(operator, op):
            # e.g. "div" is not available in the operator module
            continue
        opa = getattr(operator, op)
        flex = getattr(df, op)

        x = Series([1.0, 10.0, 100.0], [1, 2, 3])
        result = flex(x, level="third", axis=0)
        pieces = [opa(df.loc[idx[:, :, i], :], v) for i, v in x.items()]
        expected = pd.concat(pieces).sort_index()
        tm.assert_frame_equal(result, expected)

        # Only part of the "second" labels: restore the full shape first.
        x = Series([1.0, 10.0], ["two", "three"])
        result = flex(x, level="second", axis=0)
        pieces = [opa(df.loc[idx[:, i], :], v) for i, v in x.items()]
        expected = pd.concat(pieces).reindex_like(df).sort_index()
        tm.assert_frame_equal(result, expected)

    # GH9463 (alignment level of dataframe with series)
    midx = MultiIndex.from_product([["A", "B"], ["a", "b"]])
    df = DataFrame(np.ones((2, 4), dtype="int64"), columns=midx)
    s = pd.Series({"a": 1, "b": 2})

    df2 = df.copy()
    df2.columns.names = ["lvl0", "lvl1"]

    s2 = s.copy()
    s2.index.name = "lvl1"

    # different cases of integer/string level names:
    unnamed_results = [
        df.mul(s, axis=1, level=1),
        df.mul(s2, axis=1, level=1),
    ]
    named_results = [
        df2.mul(s, axis=1, level=1),
        df2.mul(s2, axis=1, level=1),
        df2.mul(s, axis=1, level="lvl1"),
        df2.mul(s2, axis=1, level="lvl1"),
    ]

    expected_values = np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype="int64")
    exp = DataFrame(expected_values, columns=midx)
    for res in unnamed_results:
        tm.assert_frame_equal(res, exp)

    exp.columns.names = ["lvl0", "lvl1"]
    for res in named_results:
        tm.assert_frame_equal(res, exp)
import numpy as np
from pandas import DataFrame

# Both frames share the same column labels.
columnNames = ['x1', 'x2', 'x3']

# First frame: 5 x 3 standard-normal values.
npdata = np.random.randn(5, 3)
data = DataFrame(npdata, columns=columnNames)
print('data =', data, sep='\n')

# Second frame, drawn independently.
data2 = DataFrame(np.random.randn(5, 3), columns=columnNames)
print('\ndata2 =', data2, sep='\n')

# Element-wise sum and product of the two frames.
print('\ndata + data2 = ', data.add(data2), sep='\n')
print('\ndata * data2 = ', data.mul(data2), sep='\n')
def de_zscore_to_val(df_zscore: pd.DataFrame,
                     df_mean_base: pd.DataFrame,
                     series_std: pd.Series) -> pd.DataFrame:
    """Turn z-scores back into values: ``z * std + mean``, column by column.

    :param df_zscore: z-scores
    :param df_mean_base: frame whose first row holds the mean per column
    :param series_std: standard deviation per column
    :return: the de-standardized frame
    """
    column_means = df_mean_base.to_numpy()[0]
    scaled = df_zscore.mul(series_std, axis=1)
    return scaled.add(column_means, axis=1)
def test_binary_ops_align(self):
    """Check that flex binary ops align a Series on one MultiIndex level.

    Covers GH 6681 (row alignment on a named index level) and GH 9463
    (column alignment when the level is given by position or by name).
    """
    # GH 6681
    index = MultiIndex.from_product(
        [list('abc'), ['one', 'two', 'three'], [1, 2, 3]],
        names=['first', 'second', 'third'])

    df = DataFrame(np.arange(27 * 3).reshape(27, 3),
                   index=index,
                   columns=['value1', 'value2', 'value3']).sort_index()

    idx = pd.IndexSlice
    for op in ['add', 'sub', 'mul', 'div', 'truediv']:
        # Names missing from the operator module (e.g. 'div') are skipped.
        opa = getattr(operator, op, None)
        if opa is None:
            continue

        x = Series([1.0, 10.0, 100.0], [1, 2, 3])
        result = getattr(df, op)(x, level='third', axis=0)

        # items() replaces iteritems(), which was removed from pandas.
        expected = pd.concat([
            opa(df.loc[idx[:, :, i], :], v) for i, v in x.items()
        ]).sort_index()
        assert_frame_equal(result, expected)

        # Only two of the three 'second' labels are supplied, so the
        # expected frame is reindexed back to the full shape of df.
        x = Series([1.0, 10.0], ['two', 'three'])
        result = getattr(df, op)(x, level='second', axis=0)

        expected = (pd.concat([
            opa(df.loc[idx[:, i], :], v) for i, v in x.items()
        ]).reindex_like(df).sort_index())
        assert_frame_equal(result, expected)

    # GH9463 (alignment level of dataframe with series)
    midx = MultiIndex.from_product([['A', 'B'], ['a', 'b']])
    df = DataFrame(np.ones((2, 4), dtype='int64'), columns=midx)
    s = pd.Series({'a': 1, 'b': 2})

    df2 = df.copy()
    df2.columns.names = ['lvl0', 'lvl1']

    s2 = s.copy()
    s2.index.name = 'lvl1'

    # different cases of integer/string level names:
    res1 = df.mul(s, axis=1, level=1)
    res2 = df.mul(s2, axis=1, level=1)
    res3 = df2.mul(s, axis=1, level=1)
    res4 = df2.mul(s2, axis=1, level=1)
    res5 = df2.mul(s, axis=1, level='lvl1')
    res6 = df2.mul(s2, axis=1, level='lvl1')

    exp = DataFrame(np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype='int64'),
                    columns=midx)

    for res in [res1, res2]:
        assert_frame_equal(res, exp)

    exp.columns.names = ['lvl0', 'lvl1']
    for res in [res3, res4, res5, res6]:
        assert_frame_equal(res, exp)
# Two Series whose labels overlap only partly.
s1 = Series([1, 2, 3], index=['a', 'b', 'c'])
s2 = Series([4, 5, 6, 7], index=['a', 'b', 'd', 'c'])
for ser in (s1, s2):
    print(ser)
print(s1 + s2)
print(s1.add(s2))  # values are combined only where the labels match

print()
# Two frames with the same columns; df2 has one extra row label.
frame_columns = list('kbs')
df1 = DataFrame(np.arange(9.).reshape(3, 3),
                columns=frame_columns,
                index=['서울', '인천', '수원'])
print(df1)
df2 = DataFrame(np.arange(12.).reshape(4, 3),
                columns=frame_columns,
                index=['서울', '인천', '일산', '수원'])
print(df2)

print()
print(df1 + df2)  # the operator form takes no options
print(df1.add(df2))  # the method form accepts options
print(df1.add(df2, fill_value=0))  # e.g. fill_value

print()
print(df1 + df2)
print(df1.mul(df2))  # mul: multiplication
print(df1.mul(df2, fill_value=0))

print()
seri = df1.iloc[0]  # first row of df1
print(seri)
print(df1 - seri)  # the row is broadcast over every row of df1
def test_binary_ops_align(self):
    """Check that flex binary ops align a Series on one MultiIndex level.

    Covers GH 6681 (row alignment on a named index level) and GH 9463
    (column alignment when the level is given by position or by name).
    """
    # GH 6681
    index = MultiIndex.from_product([list('abc'),
                                     ['one', 'two', 'three'],
                                     [1, 2, 3]],
                                    names=['first', 'second', 'third'])

    df = DataFrame(np.arange(27 * 3).reshape(27, 3),
                   index=index,
                   columns=['value1', 'value2', 'value3']).sort_index()

    idx = pd.IndexSlice
    for op in ['add', 'sub', 'mul', 'div', 'truediv']:
        # Names missing from the operator module (e.g. 'div') are skipped.
        opa = getattr(operator, op, None)
        if opa is None:
            continue

        x = Series([1.0, 10.0, 100.0], [1, 2, 3])
        result = getattr(df, op)(x, level='third', axis=0)

        # items() replaces iteritems(), which was removed from pandas.
        expected = pd.concat([opa(df.loc[idx[:, :, i], :], v)
                              for i, v in x.items()]).sort_index()
        assert_frame_equal(result, expected)

        # Only two of the three 'second' labels are supplied, so the
        # expected frame is reindexed back to the full shape of df.
        x = Series([1.0, 10.0], ['two', 'three'])
        result = getattr(df, op)(x, level='second', axis=0)

        expected = (pd.concat([opa(df.loc[idx[:, i], :], v)
                               for i, v in x.items()])
                    .reindex_like(df).sort_index())
        assert_frame_equal(result, expected)

    # GH9463 (alignment level of dataframe with series)
    midx = MultiIndex.from_product([['A', 'B'], ['a', 'b']])
    df = DataFrame(np.ones((2, 4), dtype='int64'), columns=midx)
    s = pd.Series({'a': 1, 'b': 2})

    df2 = df.copy()
    df2.columns.names = ['lvl0', 'lvl1']

    s2 = s.copy()
    s2.index.name = 'lvl1'

    # different cases of integer/string level names:
    res1 = df.mul(s, axis=1, level=1)
    res2 = df.mul(s2, axis=1, level=1)
    res3 = df2.mul(s, axis=1, level=1)
    res4 = df2.mul(s2, axis=1, level=1)
    res5 = df2.mul(s, axis=1, level='lvl1')
    res6 = df2.mul(s2, axis=1, level='lvl1')

    exp = DataFrame(np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype='int64'),
                    columns=midx)

    for res in [res1, res2]:
        assert_frame_equal(res, exp)

    exp.columns.names = ['lvl0', 'lvl1']
    for res in [res3, res4, res5, res6]:
        assert_frame_equal(res, exp)
print(frame2)

# Frame addition
add = frame1.add(frame2)
print(add)

# Frame subtraction
sub = frame2.sub(frame1)
print(sub)

# Frame division: operator form first, then the equivalent method form
div = frame2 / frame1
div = frame2.div(frame1)
print(div)  # inf: the denominator is 0

# Frame multiplication
mul = frame1.mul(frame2)
print(mul)

# Sum and mean per row (axis=1) and per column (axis=0)
sum1, sum2 = mul.sum(axis=1), mul.sum(axis=0)
print('행 단위 합계:\n', sum1)
print('열 단위 합계:\n', sum2)

avg1, avg2 = mul.mean(axis=1), mul.mean(axis=0)
print('행 단위 평균:\n', avg1)
print('열 단위 평균:\n', avg2)