Example #1
def cov_section(pair_cols: pd.DataFrame, mu_star: pd.DataFrame) -> pd.Series:
    """Returns the sectional covariogram of the pairs of function evaluations
    that resulted from each star point. This function is specific for the time-series
    varying/aggregate of the VARS sensitivity analysis.

    Parameters
    ----------
    pair_cols : pd.DataFrame
        a Pandas DataFrame of paired function evaluations
    mu_star : pd.DataFrame
        a Pandas DataFrame of mu-star values that are calculated separately

    Returns
    -------
    cov_section_values : pd.Series
        the sectional covariogram, grouped by (ts, centre, param, h)

    References
    ----------
    .. [1] Razavi, S., & Gupta, H. V. (2016). A new framework for comprehensive, 
           robust, and efficient global sensitivity analysis: 1. Theory. Water 
           Resources Research, 52(1), 423-439. doi: 10.1002/2015WR017558

    .. [2] Razavi, S., & Gupta, H. V. (2016). A new framework for comprehensive, 
           robust, and efficient global sensitivity analysis: 2. Application. Water 
           Resources Research, 52(1), 440-455. doi: 10.1002/2015WR017559

    """

    # deviations of each pair of evaluations from mu_star, multiplied and then
    # averaged within each (ts, centre, param, h) group
    deviations = pair_cols.sub(mu_star, axis=0)
    cov_section_values = (deviations[0] * deviations[1]).\
        groupby(level=['ts', 'centre', 'param', 'h']).mean()

    return cov_section_values
Example #2
import numpy as np
from pandas import DataFrame, Series


def pd_02():
    frame = DataFrame(np.arange(12.).reshape(4, 3),
                      columns=list('bde'),
                      index=['Ohio', 'Colorado', 'Utah', 'New York'])
    print(frame)
    series = frame.iloc[0]              # first row
    print(series)
    print(frame - series)               # broadcast over the rows
    series2 = Series(range(3), index=['b', 'e', 'f'])
    print(frame + series2)              # non-matching labels become NaN
    series3 = frame['d']
    print(series3)
    print(frame.sub(series3, axis=0))   # match on the index, broadcast over the columns
Example #3
    def transform(self, time_series: pd.DataFrame) -> pd.DataFrame:
        """Transform the ``time_series`` by removing the trend.

        Parameters
        ----------
        time_series: pd.DataFrame, shape (n_samples, 1), required
            The time series to transform.

        Returns
        -------
        time_series_t : pd.DataFrame, shape (n_samples, n_features)
            The transformed time series, without the trend.

        """
        check_is_fitted(self)

        time_steps = (time_series.index - self.t0_) / self.period_

        predictions = pd.Series(
            index=time_series.index,
            data=np.array([
                TRENDS[self.trend](t, self.best_trend_params_)
                for t in time_steps
            ]).flatten(),
        )

        return time_series.sub(predictions, axis=0)
Example #4
def crdDiff(dMarker: dict, dfUTMh: pd.DataFrame, plotCrds: list, logger: logging.Logger) -> Tuple[pd.DataFrame, dict]:
    """
    calculates the differences of UTM,ellH using reference position or mean position
    """
    cFuncName = colored(os.path.basename(__file__), 'yellow') + ' - ' + colored(sys._getframe().f_code.co_name, 'green')

    # determine the difference to weighted average or marker position of UTM (N,E), ellH to plot
    dfCrd = pd.DataFrame(columns=plotCrds)

    # determine the coordinates of used reference (either mean or user determined)
    if all(np.isnan(v) for v in (dMarker['UTM.E'], dMarker['UTM.N'], dMarker['ellH'])):
        # so no reference position given use mean position
        originCrds = [float(amc.dRTK['WAvg'][crd]) for crd in plotCrds]
    else:
        # make difference to reference position
        originCrds = [float(amc.dRTK['marker'][crd]) for crd in plotCrds]

    # subtract origin coordinates from UTMh positions
    dfCrd = dfUTMh.sub(originCrds, axis='columns')

    amutils.logHeadTailDataFrame(logger=logger, callerName=cFuncName, df=dfCrd, dfName='dfCrd')

    crdMax = max(dfCrd.max())
    crdMin = min(dfCrd.min())
    crdMax = int(crdMax + (1 if crdMax > 0 else -1))
    crdMin = int(crdMin + (1 if crdMin > 0 else -1))

    dCrdLim = {'max': crdMax, 'min': crdMin}

    return dfCrd, dCrdLim
Example #5
def diff(df: Union[pd.DataFrame, list],
         subtrahend: str,
         drop: bool = False) -> Union[pd.DataFrame, list]:
    """
    Subtract the column given by ``subtrahend`` from the data.

    Args:
        df : DataFrame or list of DataFrames
        subtrahend : Name of the column to subtract. Use 'mean' to subtract
            the row-wise mean instead.
        drop: Should the column given by subtrahend be dropped from the result?

    Returns:
        Union[pd.DataFrame, list]
    """

    if isinstance(df, list):
        diff_list = []
        for _df in df:
            if subtrahend == 'mean':
                diff_value = _df.mean(axis=1)
            else:
                diff_value = _df[subtrahend]
            diff = _df.sub(diff_value, axis=0)
            if drop:
                diff = diff.drop(subtrahend, axis=1)
            diff_list.append(diff)
        return diff_list
    if subtrahend == 'mean':
        diff_value = df.mean(axis=1)
    else:
        diff_value = df[subtrahend]
    diff = df.sub(diff_value, axis=0)
    if drop:
        diff = diff.drop(subtrahend, axis=1)
    return diff
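
A minimal usage sketch for the diff() helper above; the frame and the column
names are made up for illustration only.

import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0, 3.0],
                   "b": [2.0, 4.0, 6.0],
                   "c": [3.0, 6.0, 9.0]})

# subtract column "b" from every column, then drop "b" itself
print(diff(df, subtrahend="b", drop=True))

# subtract the row-wise mean from every column
print(diff(df, subtrahend="mean"))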
Example #6
    def test_column_dups_indexing(self):

        # dup aligning operations should work
        # GH 5185
        df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
        df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
        expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
        result = df1.sub(df2)
        tm.assert_frame_equal(result, expected)
Example #7
    def mean_centered(self, utility_matrix: pd.DataFrame) -> pd.DataFrame:
        """

        :param utility_matrix:
        :return:
        """
        mean_centered_utility_matrix = utility_matrix.sub(
            utility_matrix.mean())

        return self.data.similarity_matrix_cosine(mean_centered_utility_matrix)
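
A small illustration (not part of the original class) of what the .sub call
above does: with the default alignment it subtracts each column's mean,
whereas axis=0 subtracts a per-row mean. The toy utility matrix is made up.

import pandas as pd

um = pd.DataFrame({"item1": [5.0, 3.0, 1.0],
                   "item2": [4.0, 2.0, 2.0]},
                  index=["user1", "user2", "user3"])

col_centered = um.sub(um.mean())                 # subtract each item's mean rating
row_centered = um.sub(um.mean(axis=1), axis=0)   # subtract each user's mean rating
print(col_centered)
print(row_centered)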
Example #8
    def distances_euclid(data: pd.DataFrame, target_index: int) -> pd.Series:
        # Euclidean distance of every row from the target row:
        # sqrt(sum((actual_rating - target_rating)^2))
        tmp = data.sub(data.loc[target_index], axis='columns')
        tmp = tmp ** 2
        tmp = tmp.sum(axis='columns')
        return np.sqrt(tmp)
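
For reference, a self-contained sketch of the same distance computation on a
made-up ratings frame, comparing the target row against every row (including
itself, which gives distance 0).

import numpy as np
import pandas as pd

ratings = pd.DataFrame({"u1": [5.0, 3.0, 4.0],
                        "u2": [1.0, 1.0, 2.0],
                        "u3": [4.0, 5.0, 4.0]})

diffs = ratings.sub(ratings.loc[1], axis='columns')   # same idiom as above
dists = np.sqrt((diffs ** 2).sum(axis='columns'))
print(dists)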
Example #9
    def _calc_assets_returns(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """
        Calculate dataframe of assets returns

        Parameters:
        dataframe: current dataframe

        Returns:
        Dataframe with assets returns

        """

        shifted_df = dataframe.shift(1)
        return dataframe.sub(shifted_df).div(shifted_df)
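
Note that the sub/div construction above computes the same thing as
DataFrame.pct_change() with its default period of 1; a quick check on a
made-up price frame:

import pandas as pd

prices = pd.DataFrame({"A": [100.0, 110.0, 121.0],
                       "B": [50.0, 45.0, 54.0]})

returns_sub = prices.sub(prices.shift(1)).div(prices.shift(1))
returns_pct = prices.pct_change()
print(returns_sub.equals(returns_pct))  # True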
Example #10
    def transform(self, time_series: pd.DataFrame) -> pd.DataFrame:
        """Transform the ``time_series`` by removing the trend.

        Parameters
        ----------
        time_series : ``pd.DataFrame``, required.
            The time series to transform.

        Returns
        -------
        transformed_time_series : ``pd.DataFrame``
            The transformed time series, without the trend.

        """
        ts = (time_series.index - self.t0_) / self.period_

        predictions = pd.Series(
            index=time_series.index,
            data=[np.exp(t * self.model_exponent_) for t in ts],
        )

        return time_series.sub(predictions, axis=0)
Example #11
    def transform(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Transform the ``time_series`` by removing the trend.

        Parameters
        ----------
        ts: pd.DataFrame, shape (n_samples, 1), required
            The time series to transform.

        Returns
        -------
        ts_t : pd.DataFrame, shape (n_samples, n_features)
            The transformed time series, without the trend.

        """
        check_is_fitted(self)

        p = np.poly1d(self.model_weights_)
        time_steps = (ts.index - self.t0_) / self.period_

        predictions = pd.Series(index=ts.index,
                                data=[p(t) for t in time_steps])

        return ts.sub(predictions, axis=0)
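
The same idea outside the class, as a rough sketch: fit a polynomial over
integer time steps, evaluate it with np.poly1d, and remove it with
DataFrame.sub (all data and names below are made up).

import numpy as np
import pandas as pd

ts = pd.DataFrame({"y": [1.0, 2.2, 2.9, 4.1, 5.0]})
steps = np.arange(len(ts))

weights = np.polyfit(steps, ts["y"].to_numpy(), deg=1)        # linear trend
trend = pd.Series(np.poly1d(weights)(steps), index=ts.index)

detrended = ts.sub(trend, axis=0)
print(detrended)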
Example #12
def val_convert_to_zscore(df: pd.DataFrame, mean_base_date: Optional[date],
                          calc_period: Tuple[Optional[date], Optional[date]],
                          output_std_and_mean: bool = False) -> pd.DataFrame:
    """计算 calc_period 时间跨度内,以 mean_base_date 数据为均值计算 zscore 的值
        这里假定 mean_base_date 一定在 calc_period 的时间区段内

        NOTE:mean_base_date 值为 None 的时候,计算 mean 值,而不是以某一期的数据作为 mean
    """
    start, end = calc_period
    df = _filter_df_by_start_end(df, start, end)
    # 减去某一期的固定值
    if mean_base_date is None:
        series_mean = df.agg("mean", axis=0)
    else:
        series_mean = df[df.index == datetime.combine(mean_base_date, datetime.min.time())].iloc[0]

    df_delta = df.sub(series_mean, axis=1)
    df_std = df.agg("std", axis=0)
    df_zscore = df_delta.div(df_std, axis=1)
    if output_std_and_mean:
        return df_zscore, series_mean, df_std
    else:
        return df_zscore
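
Stripped of the date filtering and the mean_base_date option, the core
z-score idiom used above looks like this (the frame is made up):

import pandas as pd

df = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0],
                   "y": [10.0, 20.0, 30.0, 40.0]})

series_mean = df.agg("mean", axis=0)
df_std = df.agg("std", axis=0)
df_zscore = df.sub(series_mean, axis=1).div(df_std, axis=1)
print(df_zscore)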
Example #13
    def test_column_dups_indexing(self):
        def check(result, expected=None):
            if expected is not None:
                tm.assert_frame_equal(result, expected)
            result.dtypes
            str(result)

        # boolean indexing
        # GH 4879
        dups = ["A", "A", "C", "D"]
        df = DataFrame(np.arange(12).reshape(3, 4),
                       columns=["A", "B", "C", "D"],
                       dtype="float64")
        expected = df[df.C > 6]
        expected.columns = dups
        df = DataFrame(np.arange(12).reshape(3, 4),
                       columns=dups,
                       dtype="float64")
        result = df[df.C > 6]
        check(result, expected)

        # where
        df = DataFrame(np.arange(12).reshape(3, 4),
                       columns=["A", "B", "C", "D"],
                       dtype="float64")
        expected = df[df > 6]
        expected.columns = dups
        df = DataFrame(np.arange(12).reshape(3, 4),
                       columns=dups,
                       dtype="float64")
        result = df[df > 6]
        check(result, expected)

        # boolean with the duplicate raises
        df = DataFrame(np.arange(12).reshape(3, 4),
                       columns=dups,
                       dtype="float64")
        msg = "cannot reindex from a duplicate axis"
        with pytest.raises(ValueError, match=msg):
            df[df.A > 6]

        # dup aligning operations should work
        # GH 5185
        df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
        df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
        expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
        result = df1.sub(df2)
        tm.assert_frame_equal(result, expected)

        # equality
        df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]],
                        columns=["A", "B"])
        df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]],
                        columns=["A", "A"])

        # not-comparing like-labelled
        msg = "Can only compare identically-labeled DataFrame objects"
        with pytest.raises(ValueError, match=msg):
            df1 == df2

        df1r = df1.reindex_like(df2)
        result = df1r == df2
        expected = DataFrame(
            [[False, True], [True, False], [False, False], [True, False]],
            columns=["A", "A"],
        )
        tm.assert_frame_equal(result, expected)

        # mixed column selection
        # GH 5639
        dfbool = DataFrame({
            "one":
            Series([True, True, False], index=["a", "b", "c"]),
            "two":
            Series([False, False, True, False], index=["a", "b", "c", "d"]),
            "three":
            Series([False, True, True, True], index=["a", "b", "c", "d"]),
        })
        expected = pd.concat([dfbool["one"], dfbool["three"], dfbool["one"]],
                             axis=1)
        result = dfbool[["one", "three", "one"]]
        check(result, expected)

        # multi-axis dups
        # GH 6121
        df = DataFrame(
            np.arange(25.0).reshape(5, 5),
            index=["a", "b", "c", "d", "e"],
            columns=["A", "B", "C", "D", "E"],
        )
        z = df[["A", "C", "A"]].copy()
        expected = z.loc[["a", "c", "a"]]

        df = DataFrame(
            np.arange(25.0).reshape(5, 5),
            index=["a", "b", "c", "d", "e"],
            columns=["A", "B", "C", "D", "E"],
        )
        z = df[["A", "C", "A"]]
        result = z.loc[["a", "c", "a"]]
        check(result, expected)
Example #14
print(df1)
print(df2)
print(df1.add(df2, fill_value=0))
print(df1.reindex(columns=df2.columns, fill_value=0))
print()

print("## Operate between Series and DataFrame:")
arr = np.arange(12.).reshape((3, 4))
print(arr)
print(arr[0])
print(arr - arr[0])
frame = DataFrame(np.arange(12).reshape((4, 3)),
                  columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.ix[0]
print("frame:")
print(frame)
print("series:")
print(series)
print(type(frame))
print(type(series))
print(frame - series)
series2 = Series(range(3), index=list('bef'))
print("series2:")
print(series2)
print(frame + series2)
series3 = frame['d']
print("series3:")
print(series3)
print(frame.sub(series3, axis=0))  # subtract by column (matching on the index)
Example #15
        'large_high': Large_high.values,
        'small_high': Small_high.values
    },
    index=dates[1:])
cum_returns.head()
"""## Cumulative Return Plot of All Strategies"""

plt.figure(figsize=(20, 5))
plt.plot(cum_returns)
plt.legend(list(cum_returns.columns), loc='upper left')
plt.show()
"""## 5. Sharpe Ratio"""

risk_free = Series(d3.RF[1:].values, index=dates[1:]) / 100
summary = pd.DataFrame()
summary['excess_ret_total'] = returns.sub(
    risk_free, axis=0).add(1).resample('A').agg('prod').sub(1).mean()
summary['vol_total'] = returns.apply(np.std) * 12**0.5
summary['Sharpe_total'] = summary.excess_ret_total / summary.vol_total
summary
##sharpe ratio before 2009
summary_1 = pd.DataFrame()
summary_1['excess_ret_before'] = returns.sub(
    risk_free,
    axis=0).add(1).resample('A').agg('prod').sub(1)[:'2009-01-01'].mean()
summary_1['vol_before'] = returns[:'2009-01-01'].apply(np.std) * 12**0.5
summary_1['Sharpe_before'] = summary_1.excess_ret_before / summary_1.vol_before
summary_1
##sharpe ratio after 2009
summary_2 = pd.DataFrame()
summary_2['excess_ret_after'] = returns.sub(
    risk_free,
Example #16
frame
series

frame - series

series2 = Series(range(3), index=['b', 'e', 'f'])

frame + series2

series3 = frame['d']

frame
series3

frame.sub(series3, axis=0)

# function application and mapping

frame = DataFrame(np.random.randn(4, 3),
                  columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])

frame
np.abs(frame)

f = lambda x: x.max() - x.min()
frame.apply(f)
frame.apply(f, axis=1)

### NB: several of these are built in, e.g. sum and mean
Example #17
print frame
#         b   d   e
# Utah    0   1   2
# Ohio    3   4   5
# Texas   6   7   8
# Oregon  9  10  11
series3 = frame['d']
print series3
# Utah       1
# Ohio       4
# Texas      7
# Oregon    10
# Name: d

print frame.sub(series3, axis=0)
#         b  d  e
# Utah   -1  0  1
# Ohio   -1  0  1
# Texas  -1  0  1
# Oregon -1  0  1

print '######################################################'

frame = DataFrame(np.random.randn(4, 3),
                  columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])

print frame
#                b         d         e
# Utah   -0.776852  0.976385  0.153123
Example #18
df1 = DataFrame(np.arange(9).reshape((3,3)), columns=list('bcd'), index=['Ohio','Texas','Colorado'])
df2 = DataFrame(np.arange(12).reshape((4,3)),columns=list('bde'), index=['Utah','Ohio','Texas','Oregon'])
df1 + df2
# "alignment" means the operation only applies where index labels match
# fill_value supplies a special value for labels that do not line up
df1.add(df2, fill_value=0)
# this only fills positions that are missing from one of the frames

## Operations between DataFrame and Series
frame = DataFrame(np.arange(12).reshape((4,3)), columns=list('bde'), index=['Utah','Utahs','Texas','Oregon'])
series = frame.ix[0]
frame - series
# each row subtracts the matching elements (broadcasting down the rows);
# to broadcast down the columns instead, do the following
series3 = frame['d']
frame.sub(series3, axis=0)

## Function application and mapping
frame = DataFrame(np.random.randn(4,3), columns=list('bde'),
	index=['Utah','Ohio','Texas','Oregon'])
f = lambda x: x.max() - x.min()
# apply f to each column
frame.apply(f)
# apply f to each row
frame.apply(f, axis=1)
# a function that returns multiple values
def f(x):
	return Series([x.min(), x.max()], index=['min','max'])
frame.apply(f)
# element-wise Python functions
format = lambda x: '%.2f' % x
Example #19
'''
        b   d   e   f
Utah    0 NaN   3 NaN
Ohio    3 NaN   6 NaN
Texas   6 NaN   9 NaN
Oregon  9 NaN  12 NaN

[4 rows x 4 columns]

'''


'''
If you want to instead broadcast over the columns, matching on the rows, you have to
use one of the arithmetic methods. For example:
'''
series3 = frame['d']
print(frame.sub(series3, axis = 0))
'''
        b  d  e
Utah   -1  0  1
Ohio   -1  0  1
Texas  -1  0  1
Oregon -1  0  1

[4 rows x 3 columns]
'''



#####################################################
# Function application and mapping
############################################################
Example #20
    def test_column_dups_indexing(self):

        # dup aligning operations should work
        # GH 5185
        df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
        df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
        expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
        result = df1.sub(df2)
        tm.assert_frame_equal(result, expected)

        # equality
        df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]],
                        columns=["A", "B"])
        df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]],
                        columns=["A", "A"])

        # not-comparing like-labelled
        msg = "Can only compare identically-labeled DataFrame objects"
        with pytest.raises(ValueError, match=msg):
            df1 == df2

        df1r = df1.reindex_like(df2)
        result = df1r == df2
        expected = DataFrame(
            [[False, True], [True, False], [False, False], [True, False]],
            columns=["A", "A"],
        )
        tm.assert_frame_equal(result, expected)

        # mixed column selection
        # GH 5639
        dfbool = DataFrame({
            "one":
            Series([True, True, False], index=["a", "b", "c"]),
            "two":
            Series([False, False, True, False], index=["a", "b", "c", "d"]),
            "three":
            Series([False, True, True, True], index=["a", "b", "c", "d"]),
        })
        expected = pd.concat([dfbool["one"], dfbool["three"], dfbool["one"]],
                             axis=1)
        result = dfbool[["one", "three", "one"]]
        check(result, expected)

        # multi-axis dups
        # GH 6121
        df = DataFrame(
            np.arange(25.0).reshape(5, 5),
            index=["a", "b", "c", "d", "e"],
            columns=["A", "B", "C", "D", "E"],
        )
        z = df[["A", "C", "A"]].copy()
        expected = z.loc[["a", "c", "a"]]

        df = DataFrame(
            np.arange(25.0).reshape(5, 5),
            index=["a", "b", "c", "d", "e"],
            columns=["A", "B", "C", "D", "E"],
        )
        z = df[["A", "C", "A"]]
        result = z.loc[["a", "c", "a"]]
        check(result, expected)
Example #21
arr[0]
arr - arr[0]
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
series = frame.iloc[0]
series
frame-series
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
frame+series2
frame

series3 = frame['d']
series3
frame.sub(series3, axis='index')

frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
np.abs(frame)
f = lambda x: x.max()-x.min()
frame.apply(f)
frame.apply(f, axis='columns')
frame.apply(f, axis=1)
def f(x):
    return Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)
format = lambda x: '%.2f' % x
frame.applymap(format) #applymap is element wise
frame
Example #22
series2 = Series(range(3), index=['b', 'e', 'f'])
frame3 + series2

# In[100]:

series3 = frame3['d']
frame3

# In[102]:

series3

# In[106]:

frame4 = frame3.sub(series3, axis=0)
frame4

# In[107]:

np.abs(frame4)

# In[112]:

print(frame3, '\n----------')
f = lambda x: x.max() - x.min()
frame3.apply(f)

# In[110]:

Example #23
def main():
    # reindex
    obj = Series(range(4), index="a b c d".split(" ")[::-1])
    print obj

    obj2 = obj.reindex("a b c d e".split(" "))
    print obj2

    # Change NaN
    print obj.reindex("a b c d e".split(" "), fill_value=0)
    colors = ["blue", "purple", "yellow"]
    index = [0, 2, 4]
    obj3 = Series(colors, index=index)
    print obj3.reindex(range(6))
    print obj3.reindex(range(6), method="ffill")  # not found forward fill
    print obj3.reindex(range(6), method="backfill")  # bfill

    # DataFrame
    states = ["Ohio", "Texas", "California"]
    frame = DataFrame(np.arange(9).reshape((3, 3)), index="a b c".split(" "), columns=["Ohio", "Texas", "California"])
    print frame
    frame2 = frame.reindex("a b c d".split(" "))
    print frame2
    states[0] = "Utah"
    states[1], states[0] = states[:2]
    print frame.reindex(columns=states)
    # fill
    print frame.reindex("a b c d".split(" "), method="ffill", columns=states)
    print frame.ix["a b c d".split(" ")]
    print frame.ix["a b c d".split(" "), states]

    # Delete column
    print "", ""
    obj = Series(range(5), index="a b c d e".split(" "))
    new_obj = obj.drop("c")
    print new_obj
    print obj

    # Index reference
    print "", ""
    obj = Series(np.arange(4.0), index="a b c d".split(" "))
    print obj["b"]
    print obj[1]  # same
    print obj[2:4]
    print obj[["b", "a", "c"]]
    print obj[[1, 3]]
    print obj[obj < 2]
    # Slice with label
    print obj["b":"c"]  # include 'c'
    obj["b":"c"] = 5
    print obj

    data = DataFrame(
        np.arange(16).reshape((4, 4)),
        index=["Ohio", "Colorado", "Utah", "New York"],
        columns=["one", "two", "three", "four"],
    )
    print data
    # column
    print data["two"]
    print data[["three", "one"]]
    # row
    print data[:2]
    print data[data["three"] > 5]
    # all values
    print data < 5
    data[data < 5] = 0
    print data
    # row and column
    print data.ix[["Colorado"], ["two", "three"]]
    print data.ix[["Colorado", "Utah"], [3, 0, 1]]
    # row
    print data.ix[2]
    # label row and column, return column
    print data.ix[:"Utah", "two"]
    # xs
    # row
    print data.xs("Utah")
    print data.xs("Utah", axis=0)
    # rows
    print data.xs("two", axis=1)
    # icol/irow i is index
    print data.icol(1)
    print data.irow(1)

    # Union
    print "", ""
    s1 = Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
    s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
    print s1
    print s2
    # index is union, but d, f, g are NaN
    print s1 + s2
    df1 = DataFrame(np.arange(9.0).reshape((3, 3)), columns=list("bcd"), index=["Ohio", "Texas", "Colorado"])
    df2 = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    print df1
    print df2
    print df1 + df2

    # arithmetic method
    print "", ""
    df1 = DataFrame(np.arange(12.0).reshape((3, 4)), columns=list("abcd"))
    df2 = DataFrame(np.arange(20.0).reshape((4, 5)), columns=list("abcde"))
    print df1
    print df2
    print df1.add(df2, fill_value=0)
    # reindex has fill_value argument
    # other arithmetic method are sub/div/mul(ti)

    # Calculation in a DataFrame and Series
    print "", ""
    # subtract from each row. broadcat
    arr = np.arange(12.0).reshape((3, 4))
    print arr
    print arr[0]
    print arr - arr[0]
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    series = frame.ix[0]
    print frame
    print series
    print frame - series

    series2 = Series(range(3), index=list("bef"))
    print frame + series2

    series3 = frame["d"]
    series4 = frame.ix[0]
    print frame
    print series3
    print series4
    print frame.sub(series3, axis=0)
    print frame.sub(series4, axis=1)

    # apply function and mapping
    print "", ""
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    print frame
    f = lambda x: x.max() - x.min()
    print frame.apply(f)
    print frame.apply(f, axis=1)

    f = lambda x: Series([x.min(), x.max()], index=["min", "max"])
    print frame.apply(f)

    format = lambda x: "{0:.2f}".format(x)
    print frame.applymap(format)  # frame
    print frame["e"].map(format)  # series

    # sort and rank
    print "", ""
    obj = Series(range(4), index=list("dabc"))
    print obj
    print obj.sort_index()

    frame = DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"], columns=list("dabc"))
    print frame
    print frame.sort_index()
    print frame.sort_index(axis=1)
    print frame.sort_index(axis=1, ascending=False)

    # Sorting series
    print "", ""
    obj = Series([4, 7, -3, 2])
    print obj.order()
    obj = Series([4, np.nan, 7, np.nan, -3, 2])
    print obj.order()
    print obj.order(ascending=False)

    # order by multi columns
    print "", ""
    frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
    print frame.sort_index(by=["a", "b"])

    # rank
    print "", ""
    obj = Series([7, -5, 7, 4, 2, 0, 4])
    print obj.rank()  # method is average
    print obj.rank(method="first")  # No Duplicates
    print obj.rank(ascending=False, method="min")
    print obj.rank(ascending=False, method="max")
    f1 = DataFrame(obj, columns=["data"])
    f2 = DataFrame(obj.rank(), columns=["rank"])
    # merge by each index
    print pd.merge(f1, f2, left_index=True, right_index=True)

    # Index of the axis with duplicate values
    print "", ""
    obj = Series(range(5), index=list("aaabc"))
    print obj
    print obj.index.is_unique
    print obj["a"]
    print obj["c"]

    df = DataFrame(np.arange(12.0).reshape((4, 3)), index=list("aabb"), columns=list("ccd"))
    print df
    print df.ix["b"]
    print df["c"]
Example #24
'''
   A  B   C   D
a  0  2   4 NaN
b  3  5   7 NaN
c  6  8  10 NaN
'''
series3 = frame.A
print(series3)
'''
a    0
b    3
c    6
'''
print(frame.sub(series3, axis=0))  # column-wise operation (broadcast down the rows)
'''
   A  B  C
a  0  1  2
b  0  1  2
c  0  1  2
'''

print('Applying NumPy functions to Series/DataFrame')
frame = DataFrame(numpy.arange(9).reshape(3, 3),
                  columns=['A', 'B', 'C'],
                  index=['a', 'b', 'c'])
print(frame)
'''
Example #25
frame+se2


# In[151]:

se3=frame['b']


# In[152]:

se3


# In[153]:

frame.sub(se3,axis=0)


# ## 7. Function application and mapping

# In[154]:

frame=DataFrame(np.random.randn(4,3),columns=list('bde'),index=('Utho','Ohio','Ehir','Hude'))


# In[155]:

frame


# In[156]:
Example #26
frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill', columns=states)

# Arithmetic and data alignment
df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))

df1+df2
df1.add(df2, fill_value=0)

# Operations between DataFrame and Series
frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.ix[0]
frame - series  # broadcast across each row
series2 = frame['d']
frame.sub(series2, axis=0)  # broadcast down each column

# Function application and mapping------------------------
# NumPy ufuncs are applied element-wise
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'),index=['Utah', 'Ohio', 'Texas', 'Oregon'])
np.abs(frame)
frame.abs()
# DataFrame.apply applies the function to each column by default
f = lambda x: x.max() - x.min()  # x is a Series (one column at a time)
frame.apply(f)
frame.apply(f, axis=1)   # apply to each row

def f(x):
    return Series([x.min(), x.max()], index = ['min', 'max'])
frame.apply(f)
# element-wise Python functions should use applymap; for a Series, use map
Example #27
from pandas import DataFrame
import numpy as np

# create DataFrames
frame1 = DataFrame(np.arange(0,9).reshape(3,3),
                   columns=list('abc'))
frame2 = DataFrame(np.arange(1,10).reshape(3,3),
                   columns=list('abc'))
print(frame1)
print(frame2)

# frame addition
add = frame1.add(frame2)
print(add)

# frame subtraction
sub = frame2.sub(frame1)
print(sub)

# frame division: div = frame2 / frame1
div = frame2.div(frame1)
print(div)  # inf where the denominator is 0

# frame multiplication
mul = frame1.mul(frame2)
print(mul)

# row/column sums, means, maxima and minima

sum1 = mul.sum(axis = 1)  # across each row
sum2 = mul.sum(axis = 0)  # down each column
print('row sums:\n', sum1)
Example #28
series = frame.ix[0]
print(frame)
print(series)

print(frame - series)

series2 = Series(range(3), index=['b', 'e', 'f'])

print(frame + series2)
# only the intersection of the labels; non-matching labels become NaN

series3 = frame['d']
print(frame)
print(series3)

print(frame.sub(series3, axis=0))

#Function application and mapping

frame = DataFrame(np.random.randn(4, 3),
                  columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)
print(np.abs(frame))

f = lambda x: x.max() - x.min()

print(frame.apply(f))
print(frame.apply(f, axis=1))

Example #29
print df1
print df2
print df1 + df2
print

print 'filling missing values'
df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns = list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns = list('abcde'))
print df1
print df2
print df1.add(df2, fill_value = 0)
print df1.reindex(columns = df2.columns, fill_value = 0)
print

print 'Operations between DataFrame and Series'
arr = np.arange(12.).reshape((3, 4))
print arr
print arr[0]
print arr - arr[0]
frame = DataFrame(np.arange(12).reshape((4, 3)),
                  columns = list('bde'),
                  index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.ix[0]
print frame
print series
print frame - series
series2 = Series(range(3), index = list('bef'))
print frame + series2
series3 = frame['d']
print frame.sub(series3, axis = 0)  # subtract column by column (broadcast down the rows)
Example #30
from pandas import DataFrame, Series
import pandas as pd

############################## basic functions #################################
# basic functions: get partly info
data[['Director','id','Gerne','Runtime']]
data.ix[['Crazy Asian','Movie 2'],['Director','Runtime']]

# basic functions:math oprations
df1 = DataFrame(np.arange(12.).reshape((3,4)), columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape(4,5), columns=list('abcde'))
df1-df2 # auto matching, fill NaN
df1.add(df2, fill_value=0)
df1.sub(df2, fill_value=0)
df1.mul(df2, fill_value=0)
df1.div(df2, fill_value=0)

df1.reindex(columns=df2.columns, fill_value=0)

# function map
df1 = DataFrame(np.random.randn(4,3), columns=list('abc'), index=['id1','id2','id3','id4'])
np.abs(df1)

func1 = lambda x: x.max()-x.min()
df1.apply(func1, axis=1)

def f(x):
    return Series([x.max(), x.min()], index=['max', 'min'])
df1.apply(f)

format11 = lambda x: '%.2f' % x
Example #31
series1 = Series(np.arange(4))
series2 = Series(np.arange(1, 5))
print(series1 + series2)
print(series1 - series2)

# DataFrame + DataFrame
df1 = DataFrame(np.arange(16).reshape((4, 4)),
                columns=list('abcd'),
                index=np.arange(4))
df2 = DataFrame(np.arange(1, 17).reshape((4, 4)),
                columns=list('bcde'),
                index=np.arange(1, 5))
print(df1 + df2)
print(df1.add(df2, fill_value=0))
print(df1 - df2)
print(df2.sub(df1, fill_value=0))

# DataFrame + Series
df = DataFrame(np.arange(12).reshape((4, 3)),
               columns=list('abc'),
               index=np.arange(4))
series = Series(np.arange(3), index=df.columns)
print(df + series)
series = series.reindex(list('bcd'))
print(df + series)
series = df['a']
print(df.sub(series, axis=0))

# function applying
df = DataFrame({'Henry': {'math': 50, 'english': 80, 'phylosophy': 70, 'biology': 20},
                'Kate': {'math': 70, 'biology': 40, 'english': 90, 'phylosophy': 8},
Example #32
frame6_1 + frame6_2
frame6_1.add(frame6_2, fill_value=0)  # pass frame6_2 together with a fill_value argument
frame6_1.reindex(columns=frame6_2.columns, fill_value=0)  # a fill value can also be given when reindexing

# Operations between a DataFrame and a Series
yc10 = np.arange(12.).reshape((3, 4))
yc10
yc10 - yc10[1]

frame7 = DataFrame(np.arange(12.).reshape((4, 3)),
                   columns=list('abc'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series_7 = frame7.iloc[0]
frame7 - series_7  # subtracted from every row: this is called broadcasting
series_7_1 = frame7['b']
frame7.sub(series_7_1, axis=0)

## Function application and mapping
frame8 = DataFrame(np.random.randn(4, 3),
                   columns=list('bde'),
                   index=['Ohio', 'Utah', 'Texas', 'Oregon'])
frame8
frame8.abs()  # same as np.abs(frame8): absolute values

f = lambda x: x.max() - x.min()
frame8.apply(f)
frame8.apply(f, axis=1)


def f(x):
    return Series([x.min(), x.max()], index=['min', 'max'])
Example #33
print df1
print
print df2
print
print df1.add(df2, fill_value = 0)
print df1.reindex(columns = df2.columns, fill_value = 0)
print

print 'Operations between DataFrame and Series'
arr = np.arange(12.).reshape((3, 4))
print arr
print arr[0]
print arr - arr[0]
frame = DataFrame(np.arange(12).reshape((4, 3)),
                  columns = list('bde'),
                  index = ['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.ix[0]
print frame
print series
print
print frame - series
print
print frame
print
series2 = Series(range(3), index = list('bef'))
print frame + series2
series3 = frame['d']
print series3
print
print frame.sub(series3, axis = 0)  # subtract column by column (broadcast down the rows)
Example #34
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
print s1 + s2

df1 = DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                index=['Ohio', 'Texas', 'Colorado'])
print df1
df2 = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print df2
print df1 + df2
print df1.add(df2, fill_value=0)

arr = np.arange(12.).reshape((3, 4))
print arr - arr[0]

frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.ix[0]
series2 = Series(range(3), index=['b', 'e', 'f'])
print frame + series2

series3 = frame['d']
print frame.sub(series3, axis=0)
series4 = frame.ix['Utah']
print frame.sub(series4, axis=1)
Example #35
def get_diff_dataframe(df: pd.DataFrame, df_pre: pd.DataFrame):
    diff = df.sub(df_pre, fill_value=0)
    return diff
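
A quick illustration of the fill_value behaviour in the helper above: labels
present in only one frame are treated as 0 instead of producing NaN (the
frames are made up).

import pandas as pd

cur = pd.DataFrame({"count": [3, 5]}, index=["a", "b"])
prev = pd.DataFrame({"count": [1]}, index=["a"])
print(get_diff_dataframe(cur, prev))   # a -> 2, b -> 5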
Example #36
    def test_column_dups_indexing(self):
        def check(result, expected=None):
            if expected is not None:
                assert_frame_equal(result, expected)
            result.dtypes
            str(result)

        # boolean indexing
        # GH 4879
        dups = ['A', 'A', 'C', 'D']
        df = DataFrame(np.arange(12).reshape(3, 4), columns=[
                       'A', 'B', 'C', 'D'], dtype='float64')
        expected = df[df.C > 6]
        expected.columns = dups
        df = DataFrame(np.arange(12).reshape(3, 4),
                       columns=dups, dtype='float64')
        result = df[df.C > 6]
        check(result, expected)

        # where
        df = DataFrame(np.arange(12).reshape(3, 4), columns=[
                       'A', 'B', 'C', 'D'], dtype='float64')
        expected = df[df > 6]
        expected.columns = dups
        df = DataFrame(np.arange(12).reshape(3, 4),
                       columns=dups, dtype='float64')
        result = df[df > 6]
        check(result, expected)

        # boolean with the duplicate raises
        df = DataFrame(np.arange(12).reshape(3, 4),
                       columns=dups, dtype='float64')
        self.assertRaises(ValueError, lambda: df[df.A > 6])

        # dup aligning operations should work
        # GH 5185
        df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
        df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
        expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
        result = df1.sub(df2)
        assert_frame_equal(result, expected)

        # equality
        df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]],
                        columns=['A', 'B'])
        df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]],
                        columns=['A', 'A'])

        # not-comparing like-labelled
        self.assertRaises(ValueError, lambda: df1 == df2)

        df1r = df1.reindex_like(df2)
        result = df1r == df2
        expected = DataFrame([[False, True], [True, False], [False, False], [
                             True, False]], columns=['A', 'A'])
        assert_frame_equal(result, expected)

        # mixed column selection
        # GH 5639
        dfbool = DataFrame({'one': Series([True, True, False],
                                          index=['a', 'b', 'c']),
                            'two': Series([False, False, True, False],
                                          index=['a', 'b', 'c', 'd']),
                            'three': Series([False, True, True, True],
                                            index=['a', 'b', 'c', 'd'])})
        expected = pd.concat(
            [dfbool['one'], dfbool['three'], dfbool['one']], axis=1)
        result = dfbool[['one', 'three', 'one']]
        check(result, expected)

        # multi-axis dups
        # GH 6121
        df = DataFrame(np.arange(25.).reshape(5, 5),
                       index=['a', 'b', 'c', 'd', 'e'],
                       columns=['A', 'B', 'C', 'D', 'E'])
        z = df[['A', 'C', 'A']].copy()
        expected = z.loc[['a', 'c', 'a']]

        df = DataFrame(np.arange(25.).reshape(5, 5),
                       index=['a', 'b', 'c', 'd', 'e'],
                       columns=['A', 'B', 'C', 'D', 'E'])
        z = df[['A', 'C', 'A']]
        result = z.loc[['a', 'c', 'a']]
        check(result, expected)
Example #37
# the s1 values 0, 1, 2 are applied across columns b, d, e of df for every row

s2 = Series(range(3), index=list('bef'))
print(s2)

print(df + s2)

s3 = df['d']
print(s3)

print(df + s3)
# because s3's index introduces entirely new labels, every value in the result is NaN

# to operate along the rows, use the arithmetic methods (add, sub, etc.) and pass an axis value
print(df.add(s3, axis=0))
print(df.sub(s3, axis=0))

# Function application and mapping
## a function applied to every element of an array is called a universal function (ufunc)

# the randn function in the numpy.random module generates random normally distributed data
df = DataFrame(np.random.randn(4, 3),
               columns=list('bde'),
               index=['seoul', 'busan', 'daegu', 'incheon'])
print(df)

print(np.abs(df))
# a function that converts values to their absolute value

f = lambda x: x.max() - x.min()
Example #38
print data.ix[:'Utah','two']
print '----print data.ix[data.three>5,:3]-----'
print data.ix[data.three>5,:3]


print "#-------广播------#"
frame = DataFrame(np.arange(12.).reshape(4,3),
				index=['Utah','Ohio','Texas','Oregon'],
				columns=list('bde')
				)
print frame		

series = frame['d']
print series
# print frame.state['Utah']   # would raise AttributeError: frame has no 'state' column
print frame.sub(series,axis=0)		# axis=0 matches the series on the row index and broadcasts down each column

Example #39
def practice_two():
    # reindexing
    obj = Series(['b', 'p', 'y'], index=[0, 2, 4])
    obj.reindex(range(6), method='ffill')
    '''
    ffill       forward-fill values
    bfill       backward-fill values
    pad         forward-carry values
    backfill    backward-carry values
    '''

    frame = DataFrame(np.arange(9).reshape((3, 3)),
                      index=['a', 'c', 'd'],
                      columns=['Ohio', 'Texas', 'California'])
    # a 3x3 array, with row labels given by index and column labels by columns
    frame2 = frame.reindex(['a', 'b', 'c', 'd'])  # adds a row with label 'b'
    states = ['Texas', 'Utah', 'California']
    frame.reindex(columns=states)  # columns can be reindexed via the columns argument
    '''
    reindex arguments
        index       new sequence to use as the index
        method      interpolation (fill) method
        fill_value  substitute value to use when introducing missing data
        limit       maximum fill amount for forward/backward filling
        level       match a simple index on a level of a MultiIndex, otherwise select a subset
        copy        default True, always copy; if False, do not copy when old and new indexes are equal
    '''

    # dropping entries from an axis
    obj = Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
    obj.drop('c')  # drop row c
    obj.drop(['d', 'c'])  # drop rows d and c
    data = DataFrame(np.arange(16).reshape((4, 4)),
                     index=['o', 'c', 'u', 'n'],
                     columns=['one', 'two', 'three', 'four'])
    data.drop(['two', 'four'], axis=1)  # drop columns two and four

    # indexing, selection, filtering
    obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
    obj['b']  # equivalent to obj[1]
    obj[2:4]
    obj[['b', 'a', 'd']]
    obj[[1, 3]]
    obj[obj < 2]
    obj['b':'c']
    obj['b':'c'] = 5  # modify values
    '''
    DataFrame indexing options
        obj[val]            select a single column or a group of columns
        obj.ix[val]         a single row or a group of rows
        obj.ix[val1, val2]  select rows and columns at the same time
        reindex method      match one or more axes to a new index
        xs method           select a single row or column by label, returns a Series
        icol, irow methods  select a single column or row by integer position, returns a Series
        get_value, set_value methods  select a single value by row and column label
    '''

    # arithmetic and data alignment
    s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
    s2 = Series([-2.1, 3.6, -1.5, 4, 3.1],
                index=['a', 'c', 'e', 'f', 'g'])
    s1 + s2  # NA values are introduced at the non-overlapping index labels
    # the same happens with DataFrames
    s1.add(s2, fill_value=0)  # no NA values appear, it is a plain addition
    s1.reindex(s2.index, fill_value=0)  # reindex with a specified fill value
    '''
    add     +
    sub     -
    div     /
    mul     *
    '''

    frame = DataFrame(np.arange(12.).reshape((4, 3)),
                      columns=list('bde'),
                      index=['U', 'O', 'T', 'R'])
    series = frame.ix[0]
    frame - series
    series2 = Series(range(3), index=['b', 'e', 'f'])
    frame + series2  # NA values appear
    series3 = frame['d']
    frame.sub(series3, axis=0)

    # function application and mapping
    frame = DataFrame(np.random.randn(4, 3),
                      columns=list('bde'),
                      index=['U', 'O', 'T', 'R'])
    np.abs(frame)  # absolute values
    f = lambda x: x.max() - x.min()
    frame.apply(f)
    frame.apply(f, axis=1)
    format = lambda x: '%.2f' % x
    frame.applymap(format)
    frame['e'].map(format)

    # sorting and ranking
    '''
    .sort_index()                    sort rows lexicographically by label
    .sort_index(axis=1)              sort by column labels
    .sort_index(ascending=False)     descending order (ascending by default)
    .order()                         for a Series
    .sort_index(by='*')              sort by column *
    .rank(ascending=False, method='first', axis=1)
    # 'average' (default) mean rank, 'min' lowest, 'max' highest, 'first' rank in order of appearance
    '''

    # axis indexes with duplicate labels
    obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
    obj.index.is_unique  # are the labels unique?

    pass
Example #40
    def test_column_dups_indexing(self):
        def check(result, expected=None):
            if expected is not None:
                assert_frame_equal(result, expected)
            result.dtypes
            str(result)

        # boolean indexing
        # GH 4879
        dups = ['A', 'A', 'C', 'D']
        df = DataFrame(np.arange(12).reshape(3, 4),
                       columns=['A', 'B', 'C', 'D'],
                       dtype='float64')
        expected = df[df.C > 6]
        expected.columns = dups
        df = DataFrame(np.arange(12).reshape(3, 4),
                       columns=dups,
                       dtype='float64')
        result = df[df.C > 6]
        check(result, expected)

        # where
        df = DataFrame(np.arange(12).reshape(3, 4),
                       columns=['A', 'B', 'C', 'D'],
                       dtype='float64')
        expected = df[df > 6]
        expected.columns = dups
        df = DataFrame(np.arange(12).reshape(3, 4),
                       columns=dups,
                       dtype='float64')
        result = df[df > 6]
        check(result, expected)

        # boolean with the duplicate raises
        df = DataFrame(np.arange(12).reshape(3, 4),
                       columns=dups,
                       dtype='float64')
        msg = "cannot reindex from a duplicate axis"
        with pytest.raises(ValueError, match=msg):
            df[df.A > 6]

        # dup aligning operations should work
        # GH 5185
        df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
        df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
        expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
        result = df1.sub(df2)
        assert_frame_equal(result, expected)

        # equality
        df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]],
                        columns=['A', 'B'])
        df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]],
                        columns=['A', 'A'])

        # not-comparing like-labelled
        msg = "Can only compare identically-labeled DataFrame objects"
        with pytest.raises(ValueError, match=msg):
            df1 == df2

        df1r = df1.reindex_like(df2)
        result = df1r == df2
        expected = DataFrame(
            [[False, True], [True, False], [False, False], [True, False]],
            columns=['A', 'A'])
        assert_frame_equal(result, expected)

        # mixed column selection
        # GH 5639
        dfbool = DataFrame({
            'one':
            Series([True, True, False], index=['a', 'b', 'c']),
            'two':
            Series([False, False, True, False], index=['a', 'b', 'c', 'd']),
            'three':
            Series([False, True, True, True], index=['a', 'b', 'c', 'd'])
        })
        expected = pd.concat([dfbool['one'], dfbool['three'], dfbool['one']],
                             axis=1)
        result = dfbool[['one', 'three', 'one']]
        check(result, expected)

        # multi-axis dups
        # GH 6121
        df = DataFrame(np.arange(25.).reshape(5, 5),
                       index=['a', 'b', 'c', 'd', 'e'],
                       columns=['A', 'B', 'C', 'D', 'E'])
        z = df[['A', 'C', 'A']].copy()
        expected = z.loc[['a', 'c', 'a']]

        df = DataFrame(np.arange(25.).reshape(5, 5),
                       index=['a', 'b', 'c', 'd', 'e'],
                       columns=['A', 'B', 'C', 'D', 'E'])
        z = df[['A', 'C', 'A']]
        result = z.loc[['a', 'c', 'a']]
        check(result, expected)
Example #41
RF = ensemble.RandomForestRegressor()

trainY = DataFrame(trainSample["SalePrice"])
trainX = trainSample.drop(columns="SalePrice")
testY = DataFrame(testSample["SalePrice"])
testX = testSample.drop(columns="SalePrice")

BR.fit(trainX,trainY.values.ravel())
PC.fit(trainX,trainY.values.ravel())
RF.fit(trainX,trainY.values.ravel())
SVR.fit(trainX,trainY.values.ravel())

resultBR = BR.predict(testX)
resultPC = PC.predict(testX)
resultSVR = SVR.predict(testX)
resultRF = RF.predict(testX)

resultBRdf = DataFrame(resultBR, columns=["SalePrice"])
differenceBR = resultBRdf.sub(testY.reset_index(drop=True))
resultPCdf = DataFrame(resultPC, columns=["SalePrice"])
differencePC = resultPCdf.sub(testY.reset_index(drop=True))
resultRFdf = DataFrame(resultRF, columns=["SalePrice"])
differenceRF = resultRFdf.sub(testY.reset_index(drop=True))
resultSVRdf = DataFrame(resultSVR, columns=["SalePrice"])
differenceSVR = resultSVRdf.sub(testY.reset_index(drop=True))

print(metrics.mean_absolute_error(resultBR, testY.reset_index(drop=True)))
print(metrics.mean_absolute_error(resultPC, testY.reset_index(drop=True)))
print(metrics.mean_absolute_error(resultRF, testY.reset_index(drop=True)))
print(metrics.mean_absolute_error(resultSVR, testY.reset_index(drop=True)))