Пример #1
0
    def test_binary_ops_align(self):
        """Flex arithmetic must align a Series on a single MultiIndex level.

        The first half (GH 6681) aligns on a row level with ``axis=0``; the
        second half (GH 9463) aligns on a column level that is addressed
        either by position or by name.
        """
        # GH 6681
        index = MultiIndex.from_product(
            [list("abc"), ["one", "two", "three"], [1, 2, 3]], names=["first", "second", "third"]
        )

        # ``sortlevel`` was removed from pandas; ``sort_index`` is its replacement.
        df = DataFrame(
            np.arange(27 * 3).reshape(27, 3), index=index, columns=["value1", "value2", "value3"]
        ).sort_index()

        idx = pd.IndexSlice
        for op in ["add", "sub", "mul", "div", "truediv"]:
            # Flex methods without an ``operator`` counterpart on this
            # interpreter are skipped (``getattr`` falls back to None).
            opa = getattr(operator, op, None)
            if opa is None:
                continue

            x = Series([1.0, 10.0, 100.0], [1, 2, 3])
            result = getattr(df, op)(x, level="third", axis=0)

            # Expectation is assembled slice by slice; ``items`` replaces the
            # removed ``iteritems``.
            expected = pd.concat([opa(df.loc[idx[:, :, i], :], v) for i, v in x.items()]).sort_index()
            assert_frame_equal(result, expected)

            x = Series([1.0, 10.0], ["two", "three"])
            result = getattr(df, op)(x, level="second", axis=0)

            # Labels of "second" that are absent from x are restored by
            # ``reindex_like`` so the expectation has the same shape as df.
            expected = pd.concat([opa(df.loc[idx[:, i], :], v) for i, v in x.items()]).reindex_like(df).sort_index()
            assert_frame_equal(result, expected)

        # GH9463 (alignment level of dataframe with series)

        midx = MultiIndex.from_product([["A", "B"], ["a", "b"]])
        df = DataFrame(np.ones((2, 4), dtype="int64"), columns=midx)
        s = pd.Series({"a": 1, "b": 2})

        df2 = df.copy()
        df2.columns.names = ["lvl0", "lvl1"]
        s2 = s.copy()
        s2.index.name = "lvl1"

        # different cases of integer/string level names:
        res1 = df.mul(s, axis=1, level=1)
        res2 = df.mul(s2, axis=1, level=1)
        res3 = df2.mul(s, axis=1, level=1)
        res4 = df2.mul(s2, axis=1, level=1)
        res5 = df2.mul(s, axis=1, level="lvl1")
        res6 = df2.mul(s2, axis=1, level="lvl1")

        exp = DataFrame(np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype="int64"), columns=midx)

        # Results computed from the frame with unnamed column levels.
        for res in [res1, res2]:
            assert_frame_equal(res, exp)

        # Results computed from the frame with named column levels.
        exp.columns.names = ["lvl0", "lvl1"]
        for res in [res3, res4, res5, res6]:
            assert_frame_equal(res, exp)
Пример #2
0
def getRecommendations(dfobj, people, similarity=sim_pearson):
    """Score the entries `people` has no value for, best first.

    Every other column of `dfobj` is compared with `people` through
    `similarity(dfobj, other, people)`. Each row's score is the
    similarity-weighted sum of the other columns' values divided by the sum
    of the similarities of the columns that actually hold a value there.

    :param dfobj: DataFrame with one column per person (presumably rows are
        the rated items - confirm against the caller)
    :param people: column label of the person to recommend for
    :param similarity: callable ``(dfobj, other, people) -> number``
    :return: Series of scores sorted in descending order, restricted to the
        rows where ``dfobj[people]`` is null
    """
    dfobj2 = dfobj.drop(people, axis=1)
    cor_df = Series({p: similarity(dfobj, p, people) for p in dfobj2})
    # Only positively correlated columns contribute; the others become NaN
    # rows after alignment and drop out of the sums below.
    mul_df = DataFrame.mul(dfobj2.T, cor_df[cor_df > 0], axis=0)
    weighted_total = mul_df.sum()
    similarity_total = DataFrame.mul(pd.notnull(mul_df), cor_df, axis=0).sum()
    scores = Series.div(weighted_total, similarity_total)
    # ``Series.order`` was removed from pandas; ``sort_values`` replaces it.
    return scores.sort_values(ascending=False)[pd.isnull(dfobj[people])]
Пример #3
0
 def update_factor(self):
     """Regenerate the factor, standardise it by industry and merge it into its CSV.

     ``self.generate_factor()`` is expected to populate ``self.factor``. When
     a CSV for ``self.factor_name`` already exists, only rows newer than the
     last stored row are appended before the file is rewritten.
     """
     self.generate_factor()
     #if 'industry' in self.neutral_list:
     if True:
         industrys = tools.get_industrys('L1', self.stocks)
         # Industries without any constituent are dropped.
         industrys = {k: v for k, v in industrys.items() if len(v) > 0}
         factor = tools.standardize_industry(self.factor, industrys)
     #if 'market_capitalization' in self.neutral_list:
     if False:
         # Disabled branch: cross-sectional projection on log market capitalisation.
         market_capitalization = DataFrame({stock: pd.read_csv('%s/StockTradingDerivativeData/Stock/%s.csv'%(gc.DATABASE_PATH, stock), index_col=[0], parse_dates=[0]).loc[:, 'TOTMKTCAP'] for stock in self.stocks})
         market_capitalization = np.log(market_capitalization)
         if self.start_date:
             market_capitalization = market_capitalization.loc[market_capitalization.index >= self.start_date, :]
         if self.end_date:
             market_capitalization = market_capitalization.loc[market_capitalization.index <= self.end_date, :]
         #if 'industry' in self.neutral_list:
         if True:
             market_capitalization = tools.standardize_industry(market_capitalization, industrys)
         beta = (factor * market_capitalization).sum(1) / (market_capitalization * market_capitalization).sum(1)
         factor = factor - market_capitalization.mul(beta, axis=0)
     # NOTE(review): this fills ``self.factor``, not the standardised local
     # ``factor`` that is written below - confirm which one was intended.
     self.factor.fillna(0, inplace=True)
     factor_path = '%s/Data/%s.csv'%(gc.FACTORBASE_PATH, self.factor_name)
     if os.path.exists(factor_path):
         # Parse the stored index as dates so it compares and sorts together
         # with the freshly computed rows (assumes a date index - TODO confirm).
         factor_old = pd.read_csv(factor_path, index_col=[0], parse_dates=[0])
         if len(factor_old.index) > 0:
             # Append only rows newer than the last stored one. The previous
             # code compared ``factor`` with its own last index, which never
             # selected anything, so the file was never extended.
             factor = factor.loc[factor.index>factor_old.index[-1], :]
         factor = pd.concat([factor_old, factor], axis=0)
         factor.sort_index(axis=0, inplace=True)
         factor.sort_index(axis=1, inplace=True)
     factor.to_csv(factor_path)
Пример #4
0
    def chi(self, customattribute):
        """
        Chi-square test of `customattribute` against ``self.classAttribute``.

        Distinct ``studentID`` values are collected over all chunks, per
        (attribute value, class) pair and per class. The observed table counts
        the distinct ids per pair; the expected table spreads each attribute
        value's total over the classes in proportion to the class sizes.

        :param customattribute: name of the column to test
        :return: tuple ``(chisquare result, observed table)``
        """
        attributeDict = dict()
        classAttributeDict = dict()
        for piece in self.chunks:
            pair_ids = piece.groupby([customattribute, self.classAttribute]).studentID.unique()
            # ``items`` replaces the removed ``iteritems``; keys are
            # (attribute value, class) tuples.
            for pair, arrays in pair_ids.items():
                attributeDict[pair] = np.union1d(attributeDict.get(pair, np.array([])), arrays)

            class_ids = piece.groupby(self.classAttribute).studentID.unique()
            for classAttribute, arrays in class_ids.items():
                classAttributeDict[classAttribute] = np.union1d(
                    classAttributeDict.get(classAttribute, np.array([])), arrays)

        # Share of each class in the whole population.
        classSeries = Series(classAttributeDict).apply(len)
        classSeries /= classSeries.sum()

        # Observed counts for every attribute value / class combination.
        attributeObs = Series(attributeDict).apply(len).unstack(fill_value=0)

        # Float dtype keeps the expected table numeric for ``chisquare``.
        attributeExp = DataFrame(index=attributeObs.index, columns=attributeObs.columns, dtype=float)

        # Start every row with the row total (``.loc`` replaces the removed ``.ix``).
        for index in attributeExp.index:
            attributeExp.loc[index] = attributeObs.loc[index].sum()
        # Scale by the class shares to obtain the expected counts.
        attributeExp = attributeExp.mul(classSeries).fillna(0)
        # Chi-square statistic and p-value from observed vs. expected counts.
        return chisquare(attributeObs.stack(), attributeExp.stack()), attributeObs
Пример #5
0
    def return_weight(self,
                      fact: pd.DataFrame,
                      fact_ret: pd.DataFrame = None,
                      hp: int = 1,
                      rp: int = 20,
                      algorithm='mean') -> [pd.Series, None]:
        """
        Combine the factors into one composite using return-based weights.

        The weights are derived from labels (returns), i.e. from data that is
        still in the future relative to the current cross-section, so they are
        shifted before being matched with the factor values. With returns
        computed on open prices and a rebalancing period of N days the
        original note asks for a shift of N + 1 + 1 days; the code shifts by
        ``hp + 1`` rows.

        :param fact: standardised factors
        :param fact_ret: factor returns
        :param rp: rolling window used to compute the weights
        :param hp: holding (rebalancing) period
        :param algorithm: weighting method forwarded to ``self._weight``
        :return: row-wise sum of factor values times their shifted weights
        """
        magnitude = abs(self._weight(fact_ret, rp, algorithm))

        # Normalise every row of weights so that it sums to one.
        row_total = magnitude.sum(axis=1)
        normalised = magnitude.div(row_total, axis=0)

        # Align the weights with the factor values they may legitimately use.
        # TODO: the shift differs depending on the price used.
        aligned = normalised.shift(hp + 1)

        # Composite factor.
        return fact.mul(aligned).sum(axis=1)
Пример #6
0
    def MAX_IC_IR(self,
                  fact: pd.DataFrame,
                  fact_ret: pd.DataFrame = None,
                  hp: int = 1,
                  rp: int = 20,
                  way='IC_IR',
                  comp_name: str = 'comp_factor'):
        """
        Build a composite factor from rolling, optimiser-derived weights.

        For every window of `rp` rows of the shifted factor returns an
        optimiser ``self.OPT(window)`` is solved; its solution ``x`` becomes
        the weight row labelled with the window's last index.

        :param fact: factor values
        :param fact_ret: factor returns
        :param hp: holding period; the returns are shifted by this many rows
        :param rp: rolling window length
        :param way: ``'IC'`` replaces the optimiser's ``data_cov`` with the
            covariance of the factors over the window
        :param comp_name: name given to the returned Series
        :return: Series with the row-wise sum of factor times weight
        """
        # Adjust the returns.
        ret_real = fact_ret.shift(hp).dropna()

        w_list = []
        for i in range(rp, ret_real.shape[0] + 1):
            df_ = ret_real.iloc[i - rp:i, :]
            opt = self.OPT(df_)

            if way == 'IC':
                opt.data_cov = np.array(fact.loc[df_.index].cov())

            res_ = opt.solve()
            w_list.append(pd.Series(res_.x, index=df_.columns, name=df_.index[-1]))

        w_df = pd.DataFrame(w_list)
        # W = w_df.shift(hp)
        fact_comp = fact.mul(w_df).sum(axis=1)
        # The Series used to be assigned as its own name; use the requested label.
        fact_comp.name = comp_name
        return fact_comp
Пример #7
0
    def retWeighted(self,
                    fact: pd.DataFrame,
                    factWeight: pd.DataFrame,
                    algorithm: str = 'RetMean',
                    **kwargs) -> pd.Series:
        """
        Combine the factors into a composite using weights or a plain mean.

        Parameters
        ----------
        fact : factor values
        factWeight : raw material for the weights, passed to ``self._weight``
        algorithm : 'RetMean' (mean of historical returns), 'HalfTime'
            (half-life weighted historical returns) or 'equal' (per-date mean
            of the factors, no weights)
        kwargs : accepted for interface compatibility, not used here

        Returns
        -------
        Composite factor, one value per row of `fact`
        """

        if algorithm != 'equal':
            # Build the weights.
            factWeightNew = abs(self._weight(factWeight, self.rp, algorithm))
            # Normalise every row of weights so that it sums to one.
            factWeightStand = factWeightNew.div(factWeightNew.sum(axis=1), axis=0)
            # Shift the weights so they line up with the factor values.
            factWeightStand = factWeightStand.shift(self.hp + 1)
            # Composite factor.
            fact_comp = fact.mul(factWeightStand).sum(axis=1)
        else:
            fact_comp = fact.groupby(KN.TRADE_DATE.value, group_keys=False).apply(lambda x: x.mean(axis=1))
        return fact_comp
Пример #8
0
 def multiple_predictions(self, features: DataFrame) -> Series:
     """Predict one value per row as ``sum(features * gradients) + theta_0``.

     Raises ValueError when the number of feature columns differs from the
     number of gradients.
     """
     _, n_features = features.shape
     n_gradients = len(self.gradients)
     if n_gradients == n_features:
         weighted = features.mul(self.gradients)
         return weighted.sum(1).add(self.theta_0)
     raise ValueError(
         f"Features not the same length as gradients! Features: {n_features}, Gradients: {n_gradients}"
     )
Пример #9
0
    def structural_adj(self,
                       cov: pd.DataFrame,
                       spec_ret: pd.DataFrame,
                       fact_exp: pd.DataFrame,
                       liq_mv: pd.DataFrame,
                       liq_mv_name: PVN.LIQ_MV.value,
                       time_window: int = 120):
        """
        Blend the specific-return matrix with a structural volatility forecast.

        A per-stock blending parameter ``gam_n`` in [0, 1] is derived from the
        number of observations and the fat-tailedness of each specific-return
        series. A weighted regression of log volatility on the factor
        exposures supplies a structural forecast, and the result is
        ``(1 - gam_n) * structural + gam_n * cov`` applied row-wise.

        :param cov: Newey-West adjusted stock-specific return matrix
        :param spec_ret: stock-specific return series
        :param fact_exp: factor exposures
        :param liq_mv: free-float market value
        :param liq_mv_name: column name given to the free-float market value
        :param time_window: time window of the specific returns (not used in
            the body; the original note plans to replace it with the length of
            the specific-return series)
        :return: the structurally adjusted matrix ``F_STR``
        """
        # Compute the blending parameters.
        h_n = spec_ret.count()  # number of non-null observations
        V_n = (h_n - 20 / 4) / 20 * 2  # degree of missing data (20 is used for testing for now)

        sigma_n = spec_ret.std().fillna(1)  # equal-weighted sample std (set to 1 where it cannot be computed)  TODO

        sigma_n_steady = (spec_ret.quantile(.75) -
                          spec_ret.quantile(0.25)) / 1.35  # robust estimate of the sample std

        Z_n = abs((sigma_n - sigma_n_steady) / sigma_n_steady)  # degree of fat tails

        # Replace infinite values with 0.
        Z_n[np.isinf(Z_n)] = 0
        Z_n.fillna(0, inplace=True)

        # Clip both terms to [0, 1]: first from below / via exp, then from above.
        left_, right_ = V_n.where(V_n > 0, 0), np.exp(1 - Z_n)

        left_, right_ = left_.where(left_ < 1, 1), right_.where(right_ < 1, 1)
        gam_n = left_ * right_  # per-stock blending parameter in [0, 1]

        # The 'sigma' column holds the log of the sample std.
        reg_data = pd.concat([np.log(sigma_n), liq_mv, gam_n, fact_exp],
                             axis=1)
        reg_data.columns = ['sigma', liq_mv_name, 'gam_n'
                            ] + fact_exp.columns.tolist()

        # Only stocks with full confidence (gam_n == 1) enter the regression.
        ref_data_com = reg_data[reg_data['gam_n'] == 1]

        # Weighted least squares on the high-quality stocks to estimate the
        # factors' contribution to specific volatility.
        # NOTE(review): the original comment says the weights are the
        # free-float market value, but 'gam_n' (all ones here) is passed - confirm.
        model = sm.WLS(ref_data_com['sigma'],
                       ref_data_com[fact_exp.columns],
                       weights=ref_data_com['gam_n']).fit()

        # Structural specific-volatility forecast per stock, on the diagonal.
        sigma_STR = pd.DataFrame(np.diag(
            np.exp(np.dot(fact_exp, model.params)) * 1.05),
                                 index=fact_exp.index,
                                 columns=fact_exp.index)

        # Structural adjustment of the specific-return matrix.
        F_STR = sigma_STR.mul((1 - gam_n), axis=0) + cov.mul(gam_n, axis=0)

        return F_STR
Пример #10
0
def mean_aggregate(
    indices: pd.DataFrame,
    weight_shares: FrameOrSeriesUnion,
    axis: int = 1,
) -> pd.Series:
    """Return the sum product of `indices` and `weight_shares` along `axis`."""
    weighted = indices.mul(weight_shares)
    # With min_count=1 a slice made up only of NA values stays NA instead of
    # being reported as 0.
    aggregated = weighted.sum(axis=axis, min_count=1)
    return aggregated
Пример #11
0
def partial_correlation(C,X,S,N):
    """Correlation between C and X after removing the linear effect of S.

    Both C and X are regressed on S; the Pearson-style correlation of the two
    residual series is returned.

    :param C: first variable, one value per observation
    :param X: second variable, one value per observation
    :param S: conditioning variables, one row per observation
    :param N: number of observations used in the correlation formula
    :return: the partial correlation coefficient
    """
    S = DataFrame(S)
    C = Series(C)
    X = Series(X)
    linreg = linear_model.LinearRegression()

    # Linear fit between S and C. The fitted value of each observation is the
    # sum over the columns (axis=1); summing over the rows, as before,
    # produced one number per column and misaligned residuals.
    linreg.fit(S,C)
    fitted_c = S.mul(linreg.coef_, axis=1).sum(axis=1) + linreg.intercept_
    R_c = C - fitted_c  # residuals of C given S

    # Linear fit between S and X.
    linreg.fit(S,X)
    fitted_x = S.mul(linreg.coef_, axis=1).sum(axis=1) + linreg.intercept_
    R_x = X - fitted_x  # residuals of X given S

    numerator = N*(R_c*R_x).sum() - R_c.sum()*R_x.sum()
    denominator=np.sqrt(N*np.square(R_c).sum()-np.square(R_c.sum()))*\
        np.sqrt(N*np.square(R_x).sum()-np.square(R_x.sum()))
    rou = numerator*1.0/denominator
    return rou
Пример #12
0
    def __get_a_portfolio(self, df: pd.DataFrame) -> Union[pd.DataFrame, None]:
        """
        Collapse a per-asset table into a portfolio series.

        Every column of the table is multiplied by its weight and the
        products are summed across the columns; a missing value in a row makes
        that row's result missing as well.

        :param df: R[i,t], CR[i,t] or TR[i,t].
        :return: the weighted row sums, or None when no weights are stored
        """
        weights = self._df_raw['weights']
        if weights is not None:
            weighted = df.mul(weights)
            return weighted.sum(axis=1, skipna=False)
        return None
Пример #13
0
    def PCA(self, fact: pd.DataFrame, rp: int = 20, comp_name: str = 'comp_factor'):
        """
        Build a composite factor from rolling first-principal-component weights.

        For every window of `rp` rows a one-component PCA is fitted; its
        component vector becomes the weight row labelled with the window's
        last index.

        :param fact: factor values
        :param rp: rolling window length
        :param comp_name: name given to the returned Series
        :return: Series with the row-wise sum of factor times weight
        """
        w_list = []
        for i in range(rp, fact.shape[0] + 1):
            df_ = fact.iloc[i - rp:i, :]

            pca = PCA(n_components=1)
            pca.fit(np.array(df_))
            weight = pca.components_[0]
            w_list.append(pd.Series(data=weight, index=df_.columns, name=df_.index[-1]))
        w_df = pd.DataFrame(w_list)

        fact_comp = fact.mul(w_df).sum(axis=1)
        # The Series used to be assigned as its own name; use a real label.
        fact_comp.name = comp_name

        return fact_comp
Пример #14
0
def optPort_nco(cov, mu=None, maxNumClusters=10):
    """Portfolio optimisation with the NCO method.

    The correlation matrix is clustered, weights are optimised inside every
    cluster, then across the clusters, and the two levels are multiplied.

    :param cov: covariance matrix
    :param mu: optional expected values; indexed as ``mu[:, 0]``, so a
        two-dimensional array is expected
    :param maxNumClusters: upper bound forwarded to the clustering routine
    :return: column vector (n x 1 array) of final weights
    """
    cov = DataFrame(cov)
    if mu is not None:
        mu = Series(mu[:, 0])
    corr1 = cov2corr(cov)
    corr1, clstrs, _ = clusterKMeansBase(corr1, maxNumClusters, n_init=10)
    # Initialise with floats: the intra-cluster weights written below are
    # floats, and writing them into integer columns relies on an implicit
    # upcast that recent pandas versions deprecate.
    wIntra = DataFrame(0.0, index=cov.index, columns=list(clstrs.keys()))
    for i in clstrs:
        cov_ = cov.loc[clstrs[i], clstrs[i]].values
        if mu is None:
            mu_ = None
        else:
            mu_ = mu.loc[clstrs[i]].values.reshape(-1, 1)
        wIntra.loc[clstrs[i], i] = optPort(cov_, mu_).flatten()
    # Covariance (and mean) of the cluster portfolios.
    cov_ = wIntra.T.dot(np.dot(cov, wIntra))
    mu_ = (None if mu is None else wIntra.T.dot(mu))
    wInter = Series(optPort(cov_, mu_).flatten(), index=cov_.index)
    # Final weight = intra-cluster weight times the weight of its cluster.
    nco = wIntra.mul(wInter, axis=1).sum(axis=1).values.reshape(-1, 1)
    return nco
Пример #15
0
def _calculate_delta(resp_matrix: pd.DataFrame, meas_dict: dict,
                     keys: Sequence[str], vars_list: Sequence[str],
                     method: str, meth_opt):
    """Compute the variable deltas from the weighted response and differences.

    Returns a TfsDataFrame indexed by `vars_list` with the single column
    ``DELTA``.
    """
    weights = _join_columns(f"{WEIGHT}", meas_dict, keys)
    differences = _join_columns(f"{DIFF}", meas_dict, keys)

    # Apply the same weights to the response rows and to the differences.
    weighted_response = resp_matrix.mul(weights, axis="index")
    weighted_difference = differences * weights

    solve = _get_method_fun(method)
    solution = solve(weighted_response, weighted_difference, meth_opt)
    delta = tfs.TfsDataFrame(solution, index=vars_list, columns=[DELTA])

    # Sanity check: report how well the solution reproduces the differences.
    reproduced = np.dot(weighted_response, delta[DELTA])
    _print_rms(meas_dict, weighted_difference, reproduced)

    return delta
Пример #16
0
    def generate_factor(self):
        """Compute the factor: log of the rolling std of market-model residuals.

        Adjusted closes (``close * adj_factor``) are forward filled and turned
        into log returns. Each stock's return is regressed on the
        cross-sectional mean return over a rolling window of 20 rows; the log
        of the rolling standard deviation of the residuals, restricted to
        ``[self.start_date, self.end_date]``, is stored in ``self.factor``.
        """
        def load_adjusted_close(stock):
            # Read each file once and take both columns from it.
            daily = pd.read_csv('%s/StockDailyData/Stock/%s.csv' %
                                (gc.DATABASE_PATH, stock),
                                index_col=[0],
                                parse_dates=[0])
            return daily.loc[:, 'close'] * daily.loc[:, 'adj_factor']

        CLOSE = DataFrame({
            stock: load_adjusted_close(stock)
            for stock in self.stocks
        })
        # ``fillna(method='ffill')`` is deprecated; ``ffill`` is the direct form.
        CLOSE = CLOSE.ffill()
        r = np.log(CLOSE).diff()
        # Cross-sectional mean return, repeated for every stock column.
        r_m = r.mean(1)
        r_m = DataFrame({stock: r_m for stock in r.columns})
        n = 20

        def reg(y, x, n):
            # Rolling simple regression of y on x from rolling sums and means.
            lxx = (x**2).rolling(n).sum() - n * (x.rolling(n).mean()**2)
            lxy = (x * y).rolling(
                n).sum() - n * x.rolling(n).mean() * y.rolling(n).mean()
            beta = lxy / lxx
            alpha = y.rolling(n).mean() - beta * x.rolling(n).mean()

            return alpha, beta

        alpha, beta = reg(r, r_m, n)
        # Residual of the rolling regression.
        e = r.subtract(alpha, 1) - r_m.mul(beta, 1)

        m = n
        a = e.rolling(m).std()
        a = np.log(a)
        a = a.loc[a.index >= self.start_date, :]
        a = a.loc[a.index <= self.end_date, :]

        self.factor = a
    def placeAnalysize(self, placeAttribute, amountAttribute):
        """
        Chi-square test of average spending per place against the class attribute.

        Only rows whose ``type`` equals '消费' are used. Amounts are summed per
        ``studentID`` for every class and for every (place, class) pair over
        all chunks.

        :param placeAttribute: name of the column holding the place
        :param amountAttribute: name of the column holding the amount
        :return: tuple ``(chisquare result, expected table, observed table)``
        """
        placeAttributeDict = dict()
        classAttributeDict = dict()
        for piece in self.chunks:
            consumption = piece[piece['type'] == '消费']
            for work, df in consumption.groupby(self.classAttribute):
                spent = df.groupby('studentID')[amountAttribute].sum()
                # An explicit dtype avoids the deprecated default of ``Series([])``.
                classAttributeDict[work] = classAttributeDict.get(
                    work, Series(dtype=float)).add(spent, fill_value=0)
            for (place, work), df in consumption.groupby(
                [placeAttribute, self.classAttribute]):
                spent = df.groupby('studentID')[amountAttribute].sum()
                placeAttributeDict[(place, work)] = placeAttributeDict.get(
                    (place, work), Series(dtype=float)).add(spent, fill_value=0)

        per_pair = Series(placeAttributeDict)

        # Observed value: mean amount per student for every place / class pair.
        dfObs = per_pair.apply(lambda x: x.mean()).unstack(fill_value=0)

        # Share of every class in the total amount (acts as the prior).
        placeAttributeExp = Series(classAttributeDict).apply(lambda x: x.sum())
        placeAttributeExp = placeAttributeExp / placeAttributeExp.sum()
        # The result of ``fillna`` used to be discarded; keep it.
        placeAttributeExp = placeAttributeExp.fillna(0)

        # Expected value: each place's total amount split by the class shares,
        # then divided by the number of students in the pair. Float dtype keeps
        # the table numeric for ``chisquare``.
        dfExp = DataFrame(index=dfObs.index, columns=dfObs.columns, dtype=float)
        df = per_pair.apply(lambda x: x.sum()).unstack(fill_value=0)
        for index in dfObs.index:
            # ``.loc`` replaces the removed ``.ix``.
            dfExp.loc[index] = df.loc[index].sum()
        dfExp = dfExp.mul(placeAttributeExp).fillna(0) / per_pair.apply(
            lambda x: x.size).unstack(fill_value=0)

        # An expected value of 0 breaks the chi-square statistic, so empty or
        # infinite cells are replaced with a tiny positive number.
        dfExp = dfExp.replace([np.inf, -np.inf], np.nan).fillna(0.000001)

        # Chi-square result, expected values and observed values.
        return chisquare(dfObs.stack(), dfExp.stack()), dfExp, dfObs
Пример #18
0
def add_noise(df: DataFrame, a=-1, b=1, method='add') -> DataFrame:
    """
    Add noise to all rows in a DataFrame
    :param df: The DataFrame we want to add noise to
    :param a: lower limit of random number that we'll use
    :param b: upper limit of random number that we'll use
    :param method: 'add' or 'mul'. The method used to add noise: add or multiply every value with a random value
    :return: A new DataFrame in which every value was combined with its own random number between 'a' and 'b'
    :raises AssertionError: if `method` is neither 'add' nor 'mul'
    """
    import numpy as np

    # Reject an unknown method before doing any work.
    if method not in ('add', 'mul'):
        raise AssertionError("Method must be either 'add' or 'mul'")

    # One independent draw per cell, scaled from [0, 1) to [a, b).
    rand_vector = (b - a) * np.random.random_sample(df.shape) + a

    if method == 'add':
        return df.add(rand_vector, axis=0)
    return df.mul(rand_vector, axis=0)
Пример #19
0
    def chi(self, customattribute):
        """
        Chi-square test of `customattribute` against ``self.classAttribute``.

        Distinct ``studentID`` values are collected over all chunks, per
        (attribute value, class) pair and per class. The observed table counts
        the distinct ids per pair; the expected table spreads each attribute
        value's total over the classes in proportion to the class sizes.

        :param customattribute: name of the column to test
        :return: tuple ``(chisquare result, observed table)``
        """
        attributeDict = dict()
        classAttributeDict = dict()
        for piece in self.chunks:
            pair_ids = piece.groupby(
                [customattribute,
                 self.classAttribute]).studentID.unique()
            # ``items`` replaces the removed ``iteritems``; keys are
            # (attribute value, class) tuples.
            for pair, arrays in pair_ids.items():
                attributeDict[pair] = np.union1d(
                    attributeDict.get(pair, np.array([])), arrays)

            class_ids = piece.groupby(
                self.classAttribute).studentID.unique()
            for classAttribute, arrays in class_ids.items():
                classAttributeDict[classAttribute] = np.union1d(
                    classAttributeDict.get(classAttribute, np.array([])),
                    arrays)

        # Share of each class in the whole population.
        classSeries = Series(classAttributeDict).apply(len)
        classSeries /= classSeries.sum()

        # Observed counts for every attribute value / class combination.
        attributeObs = Series(attributeDict).apply(len).unstack(
            fill_value=0)

        # Float dtype keeps the expected table numeric for ``chisquare``.
        attributeExp = DataFrame(index=attributeObs.index,
                                 columns=attributeObs.columns,
                                 dtype=float)

        # Start every row with the row total (``.loc`` replaces the removed ``.ix``).
        for index in attributeExp.index:
            attributeExp.loc[index] = attributeObs.loc[index].sum()
        # Scale by the class shares to obtain the expected counts.
        attributeExp = attributeExp.mul(classSeries).fillna(0)
        # Chi-square statistic and p-value from observed vs. expected counts.
        return chisquare(attributeObs.stack(),
                         attributeExp.stack()), attributeObs
Пример #20
0
    def process(self,
                data: pd.DataFrame,
                factWeight: pd.DataFrame,
                method: str = 'Equal',
                rp: int = 60,
                hp: int = 5,
                **kwargs) -> pd.DataFrame:
        """
        Combine a set of factors with the selected synthesis method.

        Some weights use future data, so they are shifted before being
        matched with the factor values.

        Parameters
        ----------
        data : factor set
        factWeight : factor weights
        method : synthesis method, 'RetWeight' or 'OPT'; None returns `data`
            unchanged. NOTE(review): the default 'Equal' is not a registered
            method and raises KeyError - confirm the intended default.
        rp : rolling window
        hp : holding period
        kwargs : forwarded to the selected method

        Returns
        -------
        The output of the selected method, or `data` when `method` is None

        Raises
        ------
        KeyError : if `method` is not a registered method name
        """
        self.rp, self.hp = rp, hp

        # Nothing to combine: skip all the work below.
        if method is None:
            return data

        method_dict = {"RetWeight": self.retWeighted,
                       "OPT": self.MAX_IC_IR
                       }
        if method not in method_dict:
            raise KeyError(
                "Unknown method %r; expected one of %s" % (method, sorted(method_dict)))

        factDir = np.sign(factWeight.rolling(rp, min_periods=1).mean())
        # Returns are labels (predicted values); weighting with historical
        # returns needs one extra row of shift.
        factDir = factDir.shift(hp + 1)

        # Flip the factors to a positive direction and make the weights
        # (factor returns etc.) non-negative accordingly.
        factNew = data.mul(factDir, level=0).dropna()
        factWeightNew = factWeight.abs()

        res = method_dict[method](fact=factNew, factWeight=factWeightNew, **kwargs)
        return res
Пример #21
0
def dataFrameMathTest():
    """Print the results of common DataFrame math methods on 'mlg.csv'.

    Columns 1 to 6 (by position) of the file are used. Methods that return a
    Series work on columns by default.
    """
    # Load a DataFrame from a CSV file
    org_df = pd.read_csv('mlg.csv')
    df = org_df.iloc[:,1:7]

    # Evaluated and printed one after another, in this order.
    operations = (
        df.abs,                                      # absolute values
        df.count,                                    # non NA/null values
        df.cummax,                                   # cumulative max (cols default axis)
        df.cummin,                                   # cumulative min (cols default axis)
        df.cumsum,                                   # cumulative sum (cols default axis)
        df.diff,                                     # 1st diff (col def axis)
        lambda: df.div(12),                          # div by df, Series, value
        df.max,                                      # max of axis (col def)
        df.mean,                                     # mean (col default axis)
        df.median,                                   # median (col default)
        df.min,                                      # min of axis (col def)
        lambda: df.mul(2),                           # mul by df Series val
        df.sum,                                      # sum axis (cols default)
        lambda: df.where(df > 0.5, other=np.nan),    # keep values above 0.5
    )
    for operation in operations:
        print(operation())
Пример #22
0
    def MAX_IC_IR(self,
                  fact: pd.DataFrame,
                  factWeight: pd.DataFrame,
                  retType='IC_IR') -> pd.Series:
        """
        Build a composite factor from rolling weights solved by ``self.opt``.

        :param fact: factor values
        :param factWeight: series the weights are estimated from; shifted by
            ``self.hp + 1`` rows before use
        :param retType: ``'IC'`` takes the covariance from `fact` over the
            window, anything else from the shifted `factWeight` window
        :return: composite factor reindexed to ``fact.index``
        """

        # Set up the optimisation problem.
        # NOTE(review): a constraint is appended on every call, so repeated
        # calls accumulate entries in ``self.opt.limit`` - confirm intended.
        self.opt.obj_func = self.opt.object_func3
        self.opt.limit.append(self.opt.constraint())
        self.opt.bonds = ((0, 1),) * fact.shape[1]

        # Adjust the returns.
        factWeightNew = factWeight.shift(self.hp + 1).dropna(how='all')

        # One weight vector per rolling window, keyed by the window's last index.
        weightDict = {}
        for sub in range(self.rp, factWeightNew.shape[0] + 1):
            print(dt.datetime.now(), sub)
            df_ = factWeightNew.iloc[sub - self.rp: sub, :]
            data_mean = np.array(df_.mean())

            if retType == 'IC':
                data_cov = np.array(fact.loc[df_.index].cov())
            else:
                data_cov = np.array(df_.cov())

            optParams = {
                "data_mean": data_mean,
                "data_cov": data_cov,
            }
            self.opt.set_params(**optParams)

            res_ = self.opt.solve()
            weightDict[df_.index[-1]] = res_.x

        # Rows are window end labels, columns are the factor names.
        w_df = pd.DataFrame(weightDict, index=fact.columns).T
        fact_comp = fact.mul(w_df, level=0).dropna(how='all').sum(axis=1).reindex(fact.index)
        return fact_comp
 def __portfolio_returns(w_shifted: pd.DataFrame,
                         r: pd.DataFrame) -> pd.Series:
     """Return the row-wise sum of the returns `r` times the weights `w_shifted`.

     The two frames are aligned on their labels before multiplying; the sum
     over the columns yields one value per row, i.e. a Series.
     """
     return r.mul(w_shifted).sum(axis=1)
Пример #24
0
from pandas import DataFrame, Series
import pandas as pd

############################## basic functions #################################
# basic functions: get partly info
# Select columns, then rows and columns by label (``.ix`` was removed from
# pandas; ``.loc`` is the label-based replacement).
data[['Director','id','Gerne','Runtime']]
data.loc[['Crazy Asian','Movie 2'],['Director','Runtime']]

# basic functions: math operations
df1 = DataFrame(np.arange(12.).reshape((3,4)), columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape(4,5), columns=list('abcde'))
df1-df2 # labels are matched automatically, unmatched cells become NaN
df1.add(df2, fill_value=0)
df1.sub(df2, fill_value=0)
df1.mul(df2, fill_value=0)
df1.div(df2, fill_value=0)

df1.reindex(columns=df2.columns, fill_value=0)

# function map
df1 = DataFrame(np.random.randn(4,3), columns=list('abc'), index=['id1','id2','id3','id4'])
np.abs(df1)


def func1(x):
    """Return the range (max - min) of x."""
    return x.max()-x.min()


# Apply the range function to every row; the call used to reference ``f``
# before it was defined.
df1.apply(func1, axis=1)


def f(x):
    """Return the max and min of x as a labelled Series."""
    # The ``Series`` constructor was missing, which made this a syntax error.
    return Series([x.max(),x.min()],index=['max','min'])


df1.apply(f)


def format11(x):
    """Format x with two decimals."""
    return '%.2f' % x
Пример #25
0
    def factor_analysis(self,
                        industry_neutral=True,
                        size_neutral=True,
                        num_group=10):
        """
        Neutralise the factor and save its histogram, IC/IR curves and group back-tests.

        The five label tables ``y1``..``y5`` are loaded, restricted to
        ``self.stocks`` and to the start/end dates, and stored on ``self``.
        The factor is optionally passed through ``tools.standardize_industry``
        and has its cross-sectional projection on log market capitalisation
        removed. Results are stored in ``self.IC``, ``self.IR``,
        ``self.group_pos`` and ``self.group_backtest``; PNG files are written
        to ``<gc.MULTIFACTOR_PATH>/Results/<factor_name>/<method>``.

        :param industry_neutral: apply the industry standardisation to the
            factor, the market capitalisation and the labels
        :param size_neutral: remove the market-capitalisation projection from
            the factor and the labels
        :param num_group: number of quantile groups for the back-test
        """
        self.factor = self.inf_to_nan(self.factor)
        stocks = self.stocks
        start_date = self.start_date
        end_date = self.end_date

        # Load and trim the five label tables in one loop.
        ys = []
        for label_no in range(1, 6):
            y = pd.read_csv('%s/Data/y%s.csv' % (gc.LABELBASE_PATH, label_no),
                            index_col=[0],
                            parse_dates=[0]).loc[:, stocks]
            if start_date:
                y = y.loc[y.index >= start_date, :]
            if end_date:
                y = y.loc[y.index <= end_date, :]
            ys.append(y)
        self.y1, self.y2, self.y3, self.y4, self.y5 = ys

        # ``makedirs`` also creates missing parent folders and tolerates an
        # existing directory.
        result_dir = '%s/Results/%s/%s' % (gc.MULTIFACTOR_PATH,
                                           self.factor_name, self.method)
        os.makedirs(result_dir, exist_ok=True)
        factor = self.factor.copy()

        # Industry neutralisation.
        if industry_neutral:
            industrys = tools.get_industrys('L1', self.stocks)
            # Industries without any constituent are dropped.
            industrys = {k: v for k, v in industrys.items() if len(v) > 0}
            factor = tools.standardize_industry(self.factor, industrys)
            self.factor_industry_neutral = factor.copy()

        # Size neutralisation.
        if size_neutral:
            market_capitalization = DataFrame({
                stock:
                pd.read_csv('%s/StockTradingDerivativeData/Stock/%s.csv' %
                            (gc.DATABASE_PATH, stock),
                            index_col=[0],
                            parse_dates=[0]).loc[:, 'TOTMKTCAP']
                for stock in self.stocks
            })
            market_capitalization = np.log(market_capitalization)
            if self.start_date:
                market_capitalization = market_capitalization.loc[
                    market_capitalization.index >= self.start_date, :]
            if self.end_date:
                market_capitalization = market_capitalization.loc[
                    market_capitalization.index <= self.end_date, :]
            if industry_neutral:
                market_capitalization = tools.standardize_industry(
                    market_capitalization, industrys)
            # Per-date regression slope (no intercept) of factor on size.
            beta = (factor * market_capitalization).sum(1) / (
                market_capitalization * market_capitalization).sum(1)
            factor = factor - market_capitalization.mul(beta, axis=0)
            self.factor_industry_size_neutral = factor.copy()

        # Factor distribution.
        plt.figure(figsize=(16, 12))
        plt.hist(factor.fillna(0).values.flatten())
        plt.savefig('%s/hist.png' % result_dir)

        # IC, IR and grouped back-test.
        IC = {}
        IR = {}
        group_backtest = {}
        group_pos = {}

        # The quantile of each factor value within its row does not depend on
        # the label, so it is computed once.
        factor_quantile = DataFrame(
            rankdata(factor, axis=1),
            index=factor.index,
            columns=factor.columns).div(factor.notna().sum(1),
                                        axis=0)  # / len(factor.columns)
        factor_quantile[factor.isna()] = np.nan

        for i in range(len(ys)):
            # Start from the raw label so that disabling the industry step no
            # longer leaves ``y_neutral`` undefined.
            y_neutral = ys[i]
            if industry_neutral:
                y_neutral = tools.standardize_industry(ys[i], industrys)
            if size_neutral:
                y_neutral = y_neutral - market_capitalization.mul(
                    (y_neutral * market_capitalization).sum(1) /
                    (market_capitalization * market_capitalization).sum(1),
                    axis=0)
            IC[i] = (y_neutral *
                     factor).mean(1) / factor.std(1) / y_neutral.std(1)
            IR[i] = IC[i].rolling(20).mean() / IC[i].rolling(20).std()
            group_backtest[i] = {}
            group_pos[i] = {}
            for n in range(num_group):
                in_group = ((n / num_group <= factor_quantile)
                            & (factor_quantile <= (n + 1) / num_group))
                # 1.0 for members of the group, NaN for everything else.
                group_pos[i][n] = in_group.astype(float).where(in_group)
                group_backtest[i][n] = ((group_pos[i][n] * ys[i]).mean(1) -
                                        ys[i].mean(1)).cumsum().rename(
                                            '%s' % (n / num_group))
        self.IC = IC
        self.IR = IR
        self.group_pos = group_pos
        self.group_backtest = group_backtest

        plt.figure(figsize=(16, 12))
        for i in range(len(ys)):
            IC[i].cumsum().plot()
        plt.legend(['%s' % i for i in range(len(ys))])
        plt.savefig('%s/IC.png' % result_dir)

        plt.figure(figsize=(16, 12))
        for i in range(len(ys)):
            IR[i].cumsum().plot()
        plt.legend(['%s' % i for i in range(len(ys))])
        plt.savefig('%s/IR.png' % result_dir)

        for i in range(len(ys)):
            plt.figure(figsize=(16, 12))
            for n in range(num_group):
                group_backtest[i][n].plot()
            # A separate loop name keeps ``i`` intact for the file name below.
            plt.legend(['%s' % n for n in range(num_group)])
            plt.savefig('%s/groupbacktest%s.png' % (result_dir, i))
Пример #26
0
df1

df2 = DataFrame(np.arange(12).reshape(3,4),
                index=['2014','2015','2016'],
                columns = ['python','r','sql','plsql'])
df2

# Addition: with the operator, and with an explicit fill value.
df1 + df2
df1.add(df2, fill_value=0)

# Subtraction
df1.sub(df2, fill_value=0)

# Multiplication
df1.mul(df2, fill_value=1)

# Division
# NOTE(review): this line repeated the multiplication above; ``div`` is
# assumed to complete the add/sub/mul/div sequence - confirm.
df1.div(df2, fill_value=1)


# Broadcasting: a single value (or row) keeps being reused across the other
# operand.
obj1 = np.arange(15).reshape(5,3)
obj1
obj2 = np.arange(3)
obj2


# Broadcasting: obj2 is [0, 1, 2], so it is applied again to every row of obj1.
# obj2 is stretched to match the shape of obj1.
obj1 + obj2
Пример #27
0
def preprocess_res_sheet(
    res_sheet: pd.DataFrame,
    res_conversion_factors: pd.Series,
    template_index: pd.Index,
    years: List,
):
    """Clean a raw res sheet and convert its values with per-row factors.

    The first column of ``res_sheet`` supplies the row labels. The data
    columns are taken positionally, starting at column 19, one per entry in
    ``years``. Rows whose label is missing from ``template_index`` are
    dropped, except for a fixed list of known deviating labels, which are
    kept and reported through the root logger.

    Args:
        res_sheet: Raw sheet; column 0 holds the row labels and the data
            starts at column position 19. The passed frame is not modified.
        res_conversion_factors: Factors multiplied onto the rows (aligned on
            the row labels).
        template_index: Row labels that are expected in the sheet.
        years: Column labels for the data columns; its length determines how
            many columns are read.

    Returns:
        A DataFrame indexed by the kept row labels with ``years`` as columns.
        Values are coerced to numbers (non-numeric cells become NaN), rounded
        to two decimals and multiplied by ``res_conversion_factors``.

    Raises:
        KeyError: Propagated from ``DataFrame.drop`` if a label scheduled for
            removal is not present in the sheet index.
    """
    # Labels that may legitimately be absent from the template; they are
    # reported but their rows are kept.
    known_deviations = (
        "Verbrauch Sektor Energie: E5 & E7 (TJ)",
        "Verluste: E5 & E7 (TJ)",
        "Anrechenbare Erneuerbare (TJ)",
        "Anteil Erneuerbarer Energieträger insgesamt",
        "Anteil anrechenbare Erneuerbare Landwirtschaft",
        "Primärstrom Wasser real mit Pumpe (MWh)",
        "Primärstrom Wasser real ohne Pumpe (MWh)",
        "Umgebungswärme (anrechenbarer Anteil)",
    )
    first_data_column = 19

    # First column includes the index data
    sheet_index = pd.Index(res_sheet.iloc[:, 0])

    # Exclude empty prevailing columns. Work on a copy so the caller's frame
    # is never touched and no chained-assignment warning can be triggered.
    res_sheet = res_sheet.iloc[
        :, first_data_column : first_data_column + len(years)
    ].copy()
    res_sheet.columns = years

    # Labels present in the sheet but not in the template
    deviations = sheet_index.difference(template_index)

    # ``Index.difference`` always returns an Index (never None), so test for
    # emptiness to avoid emitting the warning banner when nothing deviates.
    if not deviations.empty:
        logger = logging.getLogger()
        logger.warning("\n\t{} WARNING {}".format("*" * 33, "*" * 33))

        deviation_idx = [
            label for label in known_deviations if label in deviations
        ]
        # Known deviations stay in the sheet; only the rest is dropped below.
        deviations = deviations.drop(deviation_idx)

        # Plain (non-raw) string so that the newline and tab are real
        # control characters instead of literal backslash sequences.
        logger.warning(
            "Deviating indices in res data:\n\t{}".format(deviation_idx)
        )

    # Set first column with indices as df index
    res_sheet = res_sheet.set_index(sheet_index, drop=True)

    # Drop rows that are not used
    res_sheet = res_sheet.drop(index=deviations, errors="raise")

    # Coercion already turns every non-numeric cell (including strings) into
    # NaN, so no separate cell-by-cell string scan is required afterwards.
    res_sheet = res_sheet.apply(pd.to_numeric, errors="coerce").round(2)

    # Convert all values to TJ and MW
    return res_sheet.mul(res_conversion_factors, axis=0)
Пример #28
0
# Basic selection on `data`.
# NOTE(review): `data` is built earlier in the script, outside this excerpt.
print(data[:2])  # select rows with a slice
print(data[data['a1'] > 4])  # select rows with a boolean mask
print(data['a1'])
# `.ix` was removed in pandas 1.0, so the explicit indexers are used instead:
# `.loc` for labels / boolean masks and `.iloc` for positions.
print(data.loc['a', ['a1', 'b1']])  # first argument selects rows, second selects columns
# NOTE(review): assumes the column labels are not integers, so the original
# `:3` was a positional slice (first three columns) — confirm.
print(data.loc[data.a1 > 4].iloc[:, :3])
print(data.loc['a', 'a1'])
# Arithmetic operations and data alignment
print('---------------------------------------')

d1 = DataFrame(np.arange(15).reshape(3, 5), index=list('abc'))
d2 = DataFrame(np.arange(20).reshape(4, 5), index=list('abcd'))
print(d1 + d2)  # addition introduces NaN where the indexes do not overlap; a fill value can be supplied instead
print(d1.add(d2, fill_value=0))  # add
print(d1.sub(d2, fill_value=0))  # subtract
print(d1.div(d2, fill_value=0))  # divide
print(d1.mul(d2, fill_value=0))  # multiply
print('-----------------Dataframe与Series之间的运算----------------------')
# Operations between a DataFrame and a Series
arr = DataFrame(
    np.arange(15).reshape(5, 3),
    columns=list('abc'),
    index=['ShangHai', 'BeiJing', 'Changsha', 'Hangzhou', 'Fujian'])
# `.ix` was removed in pandas 1.0; the row labels are strings, so the integer
# 0 was a positional lookup and `.iloc` is the direct replacement.
series = arr.iloc[0]  # take the first row
print(arr)
print(series)

print(arr - series)  # by default the Series index is matched against the DataFrame columns and broadcast down the rows
print(arr.sub(arr['a'], axis=0))  # match on the row index and broadcast across the columns
print('-----------------Dataframe函数应用和映射----------------------')
frame = DataFrame(np.random.randn(4, 3),
                  columns=list('bde'),
Пример #29
0
    def test_binary_ops_align(self):

        # test aligning binary ops

        # GH 6681
        index = MultiIndex.from_product(
            [list("abc"), ["one", "two", "three"], [1, 2, 3]],
            names=["first", "second", "third"],
        )

        df = DataFrame(
            np.arange(27 * 3).reshape(27, 3),
            index=index,
            columns=["value1", "value2", "value3"],
        ).sort_index()

        idx = pd.IndexSlice
        for op in ["add", "sub", "mul", "div", "truediv"]:
            opa = getattr(operator, op, None)
            if opa is None:
                continue

            x = Series([1.0, 10.0, 100.0], [1, 2, 3])
            result = getattr(df, op)(x, level="third", axis=0)

            expected = pd.concat([
                opa(df.loc[idx[:, :, i], :], v) for i, v in x.items()
            ]).sort_index()
            tm.assert_frame_equal(result, expected)

            x = Series([1.0, 10.0], ["two", "three"])
            result = getattr(df, op)(x, level="second", axis=0)

            expected = (pd.concat([
                opa(df.loc[idx[:, i], :], v) for i, v in x.items()
            ]).reindex_like(df).sort_index())
            tm.assert_frame_equal(result, expected)

        # GH9463 (alignment level of dataframe with series)

        midx = MultiIndex.from_product([["A", "B"], ["a", "b"]])
        df = DataFrame(np.ones((2, 4), dtype="int64"), columns=midx)
        s = pd.Series({"a": 1, "b": 2})

        df2 = df.copy()
        df2.columns.names = ["lvl0", "lvl1"]
        s2 = s.copy()
        s2.index.name = "lvl1"

        # different cases of integer/string level names:
        res1 = df.mul(s, axis=1, level=1)
        res2 = df.mul(s2, axis=1, level=1)
        res3 = df2.mul(s, axis=1, level=1)
        res4 = df2.mul(s2, axis=1, level=1)
        res5 = df2.mul(s, axis=1, level="lvl1")
        res6 = df2.mul(s2, axis=1, level="lvl1")

        exp = DataFrame(np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype="int64"),
                        columns=midx)

        for res in [res1, res2]:
            tm.assert_frame_equal(res, exp)

        exp.columns.names = ["lvl0", "lvl1"]
        for res in [res3, res4, res5, res6]:
            tm.assert_frame_equal(res, exp)
Пример #30
0
import numpy as np
from pandas import DataFrame

# Build two random 5x3 frames that share the same column labels, then show
# their element-wise sum and product.
columnNames = ['x1', 'x2', 'x3']

npdata = np.random.randn(5, 3)
data = DataFrame(npdata, columns=columnNames)
print('data =', data, sep='\n')

data2 = DataFrame(np.random.randn(5, 3), columns=columnNames)
print('\ndata2 =', data2, sep='\n')

# Flex methods; both frames have identical labels.
total = data.add(data2)
print('\ndata + data2 = ', total, sep='\n')

product = data.mul(data2)
print('\ndata * data2 = ', product, sep='\n')
Пример #31
0
def de_zscore_to_val(df_zscore: pd.DataFrame, df_mean_base: pd.DataFrame, series_std: pd.Series) -> pd.DataFrame:
    """Scale each column of ``df_zscore`` by ``series_std`` and add a base row.

    The offset is the first row of ``df_mean_base`` taken as a plain array,
    so it is applied to the columns by position rather than by label.
    """
    base_row = df_mean_base.to_numpy()[0]
    scaled = df_zscore.mul(series_std, axis=1)
    return scaled.add(base_row, axis=1)
    def test_binary_ops_align(self):

        # test aligning binary ops

        # GH 6681
        index = MultiIndex.from_product(
            [list('abc'), ['one', 'two', 'three'], [1, 2, 3]],
            names=['first', 'second', 'third'])

        df = DataFrame(np.arange(27 * 3).reshape(27, 3),
                       index=index,
                       columns=['value1', 'value2', 'value3']).sort_index()

        idx = pd.IndexSlice
        for op in ['add', 'sub', 'mul', 'div', 'truediv']:
            opa = getattr(operator, op, None)
            if opa is None:
                continue

            x = Series([1.0, 10.0, 100.0], [1, 2, 3])
            result = getattr(df, op)(x, level='third', axis=0)

            expected = pd.concat([
                opa(df.loc[idx[:, :, i], :], v) for i, v in x.iteritems()
            ]).sort_index()
            assert_frame_equal(result, expected)

            x = Series([1.0, 10.0], ['two', 'three'])
            result = getattr(df, op)(x, level='second', axis=0)

            expected = (pd.concat([
                opa(df.loc[idx[:, i], :], v) for i, v in x.iteritems()
            ]).reindex_like(df).sort_index())
            assert_frame_equal(result, expected)

        # GH9463 (alignment level of dataframe with series)

        midx = MultiIndex.from_product([['A', 'B'], ['a', 'b']])
        df = DataFrame(np.ones((2, 4), dtype='int64'), columns=midx)
        s = pd.Series({'a': 1, 'b': 2})

        df2 = df.copy()
        df2.columns.names = ['lvl0', 'lvl1']
        s2 = s.copy()
        s2.index.name = 'lvl1'

        # different cases of integer/string level names:
        res1 = df.mul(s, axis=1, level=1)
        res2 = df.mul(s2, axis=1, level=1)
        res3 = df2.mul(s, axis=1, level=1)
        res4 = df2.mul(s2, axis=1, level=1)
        res5 = df2.mul(s, axis=1, level='lvl1')
        res6 = df2.mul(s2, axis=1, level='lvl1')

        exp = DataFrame(np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype='int64'),
                        columns=midx)

        for res in [res1, res2]:
            assert_frame_equal(res, exp)

        exp.columns.names = ['lvl0', 'lvl1']
        for res in [res3, res4, res5, res6]:
            assert_frame_equal(res, exp)
Пример #33
0
# Series arithmetic: s2 has an extra label 'd' and a different label order.
s1 = Series([1, 2, 3], index=['a', 'b', 'c'])
s2 = Series([4, 5, 6, 7], index=['a', 'b', 'd', 'c'])
# The `+` operator and `.add()` both pair values by matching index label.
for shown in (s1, s2, s1 + s2, s1.add(s2)):
    print(shown)

print()
# Two frames with the same columns; df2 has one additional row label.
city_columns = list('kbs')
df1 = DataFrame(np.arange(9.0).reshape(3, 3),
                index=['서울', '인천', '수원'],
                columns=city_columns)
print(df1)
df2 = DataFrame(np.arange(12.0).reshape(4, 3),
                index=['서울', '인천', '일산', '수원'],
                columns=city_columns)
print(df2)

print()
# The operator form takes no options, while the method form accepts
# keyword options such as fill_value.
for shown in (df1 + df2, df1.add(df2), df1.add(df2, fill_value=0)):
    print(shown)

print()
# mul: multiplication, with and without a fill value.
for shown in (df1 + df2, df1.mul(df2), df1.mul(df2, fill_value=0)):
    print(shown)

print()
seri = df1.iloc[0]  # first row of df1
print(seri)
print(df1 - seri)  # the row is broadcast across all rows of df1
Пример #34
0
    def test_binary_ops_align(self):

        # test aligning binary ops

        # GH 6681
        index = MultiIndex.from_product([list('abc'),
                                         ['one', 'two', 'three'],
                                         [1, 2, 3]],
                                        names=['first', 'second', 'third'])

        df = DataFrame(np.arange(27 * 3).reshape(27, 3),
                       index=index,
                       columns=['value1', 'value2', 'value3']).sort_index()

        idx = pd.IndexSlice
        for op in ['add', 'sub', 'mul', 'div', 'truediv']:
            opa = getattr(operator, op, None)
            if opa is None:
                continue

            x = Series([1.0, 10.0, 100.0], [1, 2, 3])
            result = getattr(df, op)(x, level='third', axis=0)

            expected = pd.concat([opa(df.loc[idx[:, :, i], :], v)
                                  for i, v in x.iteritems()]).sort_index()
            assert_frame_equal(result, expected)

            x = Series([1.0, 10.0], ['two', 'three'])
            result = getattr(df, op)(x, level='second', axis=0)

            expected = (pd.concat([opa(df.loc[idx[:, i], :], v)
                                   for i, v in x.iteritems()])
                        .reindex_like(df).sort_index())
            assert_frame_equal(result, expected)

        # GH9463 (alignment level of dataframe with series)

        midx = MultiIndex.from_product([['A', 'B'], ['a', 'b']])
        df = DataFrame(np.ones((2, 4), dtype='int64'), columns=midx)
        s = pd.Series({'a': 1, 'b': 2})

        df2 = df.copy()
        df2.columns.names = ['lvl0', 'lvl1']
        s2 = s.copy()
        s2.index.name = 'lvl1'

        # different cases of integer/string level names:
        res1 = df.mul(s, axis=1, level=1)
        res2 = df.mul(s2, axis=1, level=1)
        res3 = df2.mul(s, axis=1, level=1)
        res4 = df2.mul(s2, axis=1, level=1)
        res5 = df2.mul(s, axis=1, level='lvl1')
        res6 = df2.mul(s2, axis=1, level='lvl1')

        exp = DataFrame(np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype='int64'),
                        columns=midx)

        for res in [res1, res2]:
            assert_frame_equal(res, exp)

        exp.columns.names = ['lvl0', 'lvl1']
        for res in [res3, res4, res5, res6]:
            assert_frame_equal(res, exp)
Пример #35
0
# Element-wise arithmetic between two frames, followed by row/column
# aggregates of the product.
# NOTE(review): frame1 and frame2 are created earlier in the script, outside
# this excerpt.
print(frame2)

# Frame addition
add = frame1.add(frame2)
print(add)

# Frame subtraction
sub = frame2.sub(frame1)
print(sub)

# Frame division: div = frame2 / frame1
div = frame2.div(frame1)
print(div) # inf: where the denominator is 0

# Frame multiplication
mul = frame1.mul(frame2)
print(mul)

# Sum / mean / max / min per row and per column

sum1 = mul.sum(axis = 1) # per row
sum2 = mul.sum(axis = 0) # per column
print('행 단위 합계:\n',sum1)
print('열 단위 합계:\n',sum2)


avg1 = mul.mean(axis = 1) # mean per row
avg2 = mul.mean(axis = 0) # mean per column
print('행 단위 평균:\n',avg1)
print('열 단위 평균:\n',avg2)