Example #1
    def corr_vars(self, xarr2):
        from pandas import DataFrame

        xarr1 = self._obj.copy()
        assert (
            xarr1.shape == xarr2.shape
        ), "The input DataArray must be the same size as {}".format(self.name)

        xarr3 = xarr1[:1].mean("time").copy()

        t, y, x = xarr1.shape

        df1 = DataFrame(xarr1.values.reshape(t, y * x))
        df2 = DataFrame(xarr2.values.reshape(t, y * x))

        dfcor = df1.corrwith(df2).values.reshape(y, x)
        xarr3.values = dfcor

        xarr3.attrs["long_name"] = "Correlation of %s and %s" % (
            xarr1.name,
            xarr2.name,
        )
        xarr3.name = "corr_%s_vs_%s" % (xarr1.name, xarr2.name)

        xarr3.encoding.update({"zlib": True, "shuffle": True, "complevel": 4})

        return xarr3
Example #2
def df_error_analysis(dfA: pd.DataFrame, dfB: pd.DataFrame, **kwargs):
    """ 
    两个DataFrame的偏差分析,注意:两个DataFrame的列和索引应该一致
    调用案例:df_error_analysis(dfA,dfB,col=['open'])
    """
    col = kwargs.pop('col', None)
    #皮尔森相关系数
    corr_method = kwargs.pop('corr_method', 'pearson')

    # Find their differences
    diff = dfA - dfB

    df = diff.describe()
    # variance, mean absolute deviation, unbiased standard error of the mean, Pearson correlation
    extra = pd.DataFrame([
        diff.var(),
        diff.mad(),
        diff.sem(),
        dfA.corrwith(dfB, method=corr_method)
    ],
                         index=['var', 'mad', 'sem', 'corr'])

    # Append the extra statistics (DataFrame.append and Series.mad were
    # removed in pandas 2.0; on newer pandas use pd.concat([df, extra]))
    df = df.append(extra)

    if col is not None:
        return df[col]
    else:
        return df
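A minimal usage sketch of df_error_analysis on toy frames (hypothetical data; since the function relies on DataFrame.append and Series.mad, it assumes pandas < 2.0):

import pandas as pd

# toy frames sharing columns and index (made-up numbers)
dfA = pd.DataFrame({'open': [1.0, 2.0, 3.0], 'close': [1.5, 2.5, 3.5]})
dfB = pd.DataFrame({'open': [1.1, 2.1, 2.9], 'close': [1.4, 2.4, 3.6]})

# describe() rows plus the var/mad/sem/corr rows, restricted to 'open'
print(df_error_analysis(dfA, dfB, col=['open']))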
Example #3
def stocks_corr_analyzation(days, stock, *compare_stocks):
    ''' analyze the pct_change corr between stocks '''
    compare_stock_data = {}
    for s in compare_stocks:
        try:
            compare_stock_data[s.name] = get_recent_data(s.code,
                                                         s.market_code,
                                                         days,
                                                         update=False)
        except Exception as e:
            print_err(e)

    df_compare_stocks_pct = DataFrame(
        {stock: df['Adj Close']
         for stock, df in compare_stock_data.items()}).pct_change()

    if stock is not None:
        if stock in compare_stocks:
            ser_stock_pct = df_compare_stocks_pct[stock.name]
            df_compare_stocks_pct.drop(columns=stock.name, inplace=True)
        else:
            df = get_recent_data(stock.code,
                                 stock.market_code,
                                 days,
                                 update=False)
            ser_stock_pct = df['Adj Close'].pct_change()

        ser = df_compare_stocks_pct.corrwith(ser_stock_pct)
        # keep correlations >= 0.5 and format them
        ser = ser[ser >= 0.5].map(lambda x: '{:.2f}'.format(x))
        ser.name = 'price change correlation'
        return ser.to_frame()
    else:
        df = df_compare_stocks_pct.corr()
        return df.applymap(lambda x: '{:.2f}'.format(x))
Example #4
def top_correlated_features(df: DataFrame, target_feature, n=5):
    """
    Returns the names of features most strongly correlated (correlation close
    to 1 or -1) with a target feature. Correlation is in the Pearson's-r sense.

    :param df: A pandas dataframe.
    :param target_feature: The name of the target feature.
    :param n: Number of top features to return.
    :return: A tuple of
        - top_n_features: Sequence of the top feature names
        - top_n_corr: Sequence of correlation coefficients of above features
        Both the returned sequences should be sorted so that the best (most
        correlated) feature is first.
    """

    # TODO: Calculate correlations with target and sort features by it

    # ====== YOUR CODE: ======
    table = df.corrwith(other=df[target_feature], method='pearson')
    table_abs = table.abs()
    top_n_features = table_abs.nlargest(n=n + 1).sort_values(ascending=False)
    top_n_features = [feature[0] for feature in top_n_features[1:].items()]
    top_n_corr = [table.get(f) for f in top_n_features]
    # ========================

    return top_n_features, top_n_corr
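A quick sanity check of top_correlated_features on synthetic data (column names are made up); 'x1' and 'x2' should surface first:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({'x1': rng.normal(size=100), 'x3': rng.normal(size=100)})
df['x2'] = 2 * df['x1'] + 0.1 * rng.normal(size=100)  # strongly tied to x1
df['y'] = df['x1'] + 0.1 * rng.normal(size=100)       # target tracks x1

names, corrs = top_correlated_features(df, 'y', n=2)
print(names)   # expect x1 and x2 (order may vary)
print(corrs)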
Example #5
    def test_corrwith(self, datetime_frame):
        a = datetime_frame
        noise = Series(np.random.randn(len(a)), index=a.index)

        b = datetime_frame.add(noise, axis=0)

        # make sure order does not matter
        b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:])
        del b["B"]

        colcorr = a.corrwith(b, axis=0)
        tm.assert_almost_equal(colcorr["A"], a["A"].corr(b["A"]))

        rowcorr = a.corrwith(b, axis=1)
        tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0))

        dropped = a.corrwith(b, axis=0, drop=True)
        tm.assert_almost_equal(dropped["A"], a["A"].corr(b["A"]))
        assert "B" not in dropped

        dropped = a.corrwith(b, axis=1, drop=True)
        assert a.index[-1] not in dropped.index

        # non time-series data
        index = ["a", "b", "c", "d", "e"]
        columns = ["one", "two", "three", "four"]
        df1 = DataFrame(np.random.randn(5, 4), index=index, columns=columns)
        df2 = DataFrame(np.random.randn(4, 4),
                        index=index[:4],
                        columns=columns)
        correls = df1.corrwith(df2, axis=1)
        for row in index[:4]:
            tm.assert_almost_equal(correls[row],
                                   df1.loc[row].corr(df2.loc[row]))
Example #6
def target_correlation(features: pd.DataFrame,
                       target: DataType,
                       method: str = "pearson") -> pd.Series:
    """
    Calculate the correlation between each feature and the target, returning a sorted pd.Series

    Parameters
    ----------
    features: pd.DataFrame
        Features to calculate target_correlation for
    target: np.ndarray or pd.Series
        Target variable
    method: str
        Which correlation to use. One of 'pearson', 'spearman', 'kendall'

    Returns
    -------
    pd.Series
        Series of correlations sorted by absolute value
    """
    if isinstance(target, np.ndarray):
        target = pd.Series(target)

    corr = features.corrwith(target, method=method)
    sorted_idx = np.argsort(corr.abs())
    return corr[sorted_idx]
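A small sketch of calling target_correlation, assuming the snippet's module-level imports (numpy, pandas, and its DataType alias) are available; the toy target is built to track column 'a':

import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
features = pd.DataFrame({'a': rng.normal(size=50), 'b': rng.normal(size=50)})
target = 3 * features['a'] + 0.5 * rng.normal(size=50)

# sorted by absolute correlation, so 'b' (weak) comes before 'a' (strong)
print(target_correlation(features, target.to_numpy()))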
Example #7
    def test_corrwith_index_union(self):
        df1 = DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"])
        df2 = DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"])

        result = df1.corrwith(df2, drop=False).index.sort_values()
        expected = df1.columns.union(df2.columns).sort_values()
        tm.assert_index_equal(result, expected)
Example #8
 def test_corrwith_mixed_dtypes(self, numeric_only):
     # GH#18570
     df = DataFrame({
         "a": [1, 4, 3, 2],
         "b": [4, 6, 7, 3],
         "c": ["a", "b", "c", "d"]
     })
     s = Series([0, 6, 7, 3])
     if numeric_only:
         result = df.corrwith(s, numeric_only=numeric_only)
         corrs = [df["a"].corr(s), df["b"].corr(s)]
         expected = Series(data=corrs, index=["a", "b"])
         tm.assert_series_equal(result, expected)
     else:
         with pytest.raises(TypeError,
                            match="not supported for the input types"):
             df.corrwith(s, numeric_only=numeric_only)
Example #9
    def test_corrwith_matches_corrcoef(self):
        df1 = DataFrame(np.arange(10000), columns=["a"])
        df2 = DataFrame(np.arange(10000)**2, columns=["a"])
        c1 = df1.corrwith(df2)["a"]
        c2 = np.corrcoef(df1["a"], df2["a"])[0][1]

        tm.assert_almost_equal(c1, c2)
        assert c1 < 1
Example #10
def cat_corr_heatmap(
    *,
    data: pd.DataFrame,
    categorical: str,
    transpose: bool = False,
    high_corr: float = None,
    scale: float = 0.5,
    no_prefix: bool = True,
    ax: plt.Axes = None,
    **kwargs,
) -> plt.Axes:
    """Plot a correlation heatmap of categorical vs. numeric features.

    Args:
        data (DataFrame): Frame containing categorical and numeric data.
        categorical (str): Name or list of names of categorical features.
        transpose (bool, optional): Swap axes, putting numeric features on the y-axis. Defaults to False.
        high_corr (float, optional): Threshold for flagging high correlations. Defaults to None.
        scale (float, optional): Multiplier for determining figsize. Defaults to 0.5.
        no_prefix (bool, optional): If only one cat, do not prefix dummies. Defaults to True.
        ax (Axes, optional): Axes to plot on. Defaults to None.

    Returns:
        Axes: Axes of the plot.
    """
    if isinstance(categorical, str):
        ylabel = utils.to_title(categorical)
        categorical = [categorical]
        single_cat = True
    else:
        ylabel = "Categorical Features"
        single_cat = False
    title = "Correlation with Numeric Features"
    cat_df = data.filter(categorical, axis=1)
    if no_prefix and single_cat:
        dummies = pd.get_dummies(cat_df, prefix="", prefix_sep="")
    else:
        dummies = pd.get_dummies(cat_df)
    corr_df = dummies.apply(lambda x: data.corrwith(x))
    if not transpose:
        corr_df = corr_df.T
    if high_corr is not None:
        if "annot" not in kwargs or kwargs.get("annot"):
            kwargs["annot"] = corr_df.values
        corr_df = corr_df.abs() > high_corr
        kwargs["center"] = None
        title = f"High {title}"
    if ax is None:
        fig, ax = plt.subplots(figsize=figsize_like(corr_df, scale=scale))
    style = dict(HEATMAP_STYLE)
    style.update(kwargs)
    ax = sns.heatmap(corr_df, ax=ax, **style)
    xlabel = "Numeric Features"
    if transpose:
        xlabel, ylabel = ylabel, xlabel
    ax.set_xlabel(xlabel, labelpad=10)
    ax.set_ylabel(ylabel, labelpad=10)
    ax.set_title(title, pad=10)
    return ax
Example #11
def correlate(df: pd.DataFrame, response: str):
    """Returns the correlation matrix for a dataframe"""
    df = df[df.columns[df.nunique() > 1]].copy()
    if response not in df.columns:
        df[response] = np.nan
    series = df[response]
    df = df.drop(columns=[response])
    corrdf = df.corrwith(series)
    return corrdf.reindex(corrdf.abs().sort_values().index)
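For reference, a tiny run of the correlate helper (toy columns); the result is a Series ordered weakest to strongest by absolute correlation:

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0, 3.0, 4.0],
                   'b': [4.0, 3.1, 2.2, 0.9],
                   'resp': [1.1, 1.9, 3.2, 3.8]})
print(correlate(df, 'resp'))  # 'a' near +1, 'b' near -1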
Example #12
    def test_corrwith_dup_cols(self):
        # GH#21925
        df1 = DataFrame(np.vstack([np.arange(10)] * 3).T)
        df2 = df1.copy()
        df2 = pd.concat((df2, df2[0]), axis=1)

        result = df1.corrwith(df2)
        expected = Series(np.ones(4), index=[0, 0, 1, 2])
        tm.assert_series_equal(result, expected)
Example #13
def generate_recommendation(movie_matrix: pd.DataFrame, movie_title: str,
                            year: str) -> pd.DataFrame:
    user_rating = movie_matrix[f'{movie_title} ({year})']
    user_rating = user_rating.dropna()

    similar = movie_matrix.corrwith(user_rating)
    corr = pd.DataFrame(similar, columns=['correlation'])

    corr.dropna(inplace=True)
    return corr
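A sketch of generate_recommendation with a made-up user-by-movie ratings matrix (NaN marks unrated entries; titles are only illustrative):

import numpy as np
import pandas as pd

movie_matrix = pd.DataFrame({
    'Toy Story (1995)': [5.0, 4.0, np.nan, 2.0, 5.0],
    'GoldenEye (1995)': [3.0, np.nan, 2.0, 5.0, 1.0],
    'Heat (1995)':      [4.0, 4.5, np.nan, 2.5, 4.5],
})

# correlation of every movie's ratings with 'Toy Story (1995)'
print(generate_recommendation(movie_matrix, 'Toy Story', '1995'))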
Example #14
 def test_corrwith_mixed_dtypes(self):
     # GH#18570
     df = DataFrame(
         {"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]}
     )
     s = Series([0, 6, 7, 3])
     result = df.corrwith(s)
     corrs = [df["a"].corr(s), df["b"].corr(s)]
     expected = Series(data=corrs, index=["a", "b"])
     tm.assert_series_equal(result, expected)
Example #15
def boolean_violinplots(
    crosstab: pd.DataFrame,
    y_series: pd.Series,
    suptitle: str,
    xlabels: list = None,
    ylabel: str = None,
    include: list = None,
    figsize: tuple = (12, 8),
    **kwargs,
) -> np.ndarray:
    """Create multiple violin plots showing distributions for True and False.

    Args:
        crosstab (pd.DataFrame): Crosstab frequency table for categorical variables.
        y_series (pd.Series): Data for y-axis.
        suptitle (str): Figure title.
        xlabels (list, optional): Labels for x-axes. Defaults to None.
        ylabel (str, optional): Label for y-axis. Defaults to None.
        include (list, optional): Columns of `crosstab` to plot. Defaults to None.
        figsize (tuple, optional): Figure size. Defaults to (12, 8).

    Returns:
        np.ndarray: Array of Axes.
    """
    ncols = 2
    nrows = int(np.ceil(crosstab.shape[1] / 2))
    if include:
        crosstab = crosstab.loc[:, include]
        nrows = int(np.ceil(len(include) / 2))
    corr = crosstab.corrwith(y_series)
    fig, axs = plt.subplots(
        nrows=nrows, ncols=ncols, sharey=True, figsize=figsize
    )
    for i, ax in enumerate(axs.flat):
        ax = sns.violinplot(x=crosstab.iloc[:, i], y=y_series, ax=ax, **kwargs)
        ax.set_ylabel(None)
        if xlabels:
            ax.set_xlabel(xlabels[i])
        cat_corr = np.round(corr.iloc[i], 2)
        text = f"Corr: {cat_corr}"
        ax.text(
            0.975,
            1.025,
            text,
            horizontalalignment="right",
            verticalalignment="center",
            transform=ax.transAxes,
            fontsize=12,
        )
    if ylabel:
        for ax in axs[:, 0]:
            ax.set_ylabel(ylabel, labelpad=10)
    fig.suptitle(suptitle)
    fig.tight_layout()
    return axs
Example #16
 def __init__(self, data: pd.DataFrame, scores: pd.DataFrame, weights: pd.DataFrame, odm: pd.DataFrame,
              r_squared: pd.Series):
     self.__crossloadings = scores.apply(lambda s: data.corrwith(s))
     loading = (self.__crossloadings * odm).sum(axis=1).to_frame(name="loading")
     communality = loading.apply(lambda s: pow(s, 2))
     communality.columns = ["communality"]
     r_squared_aux = odm.dot(pd.DataFrame(np.diag(r_squared), index=r_squared.index, columns=r_squared.index)).sum(
         axis=1).to_frame(name="communality")
     redundancy = communality * r_squared_aux
     redundancy.columns = ["redundancy"]
     self.__outer_model = pd.concat([weights, loading, communality, redundancy], axis=1, sort=True)
Example #17
def correlate(df: pd.DataFrame, response: str) -> pd.Series:
    """Returns the correlation matrix for a dataframe

    This function is the same as in ParameterAnalysis and could be generalized
    """
    df = df[df.columns[df.nunique() > 1]].copy()
    if response not in df.columns:
        df[response] = np.nan
    series = df[response]
    df = df.drop(columns=[response])
    corrdf = df.corrwith(series)
    return corrdf.reindex(corrdf.abs().sort_values().index)
Example #18
def correlate_response_with_dataframe(
        df: pd.DataFrame,
        response: str,
        corrwith: Optional[list] = None) -> pd.Series:
    """Returns the correlation matrix for a dataframe soprted by correlation"""
    df = df[corrwith +
            [response]].copy() if corrwith is not None else df.copy()
    df = df[df.columns[df.nunique() > 1]]
    if response not in df.columns:
        df[response] = np.nan
    series = df[response]
    df = df.drop(columns=[response])
    corrdf = df.corrwith(series)
    return corrdf.reindex(corrdf.abs().sort_values().index)
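Same pattern as the two variants above, but the optional corrwith list restricts which columns are correlated. A toy call (names are illustrative; assumes the function's own imports, including typing.Optional, are in place):

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0, 3.0, 4.0],
                   'b': [4.0, 3.0, 2.0, 1.0],
                   'c': [0.3, 0.1, 0.4, 0.2],
                   'y': [1.1, 2.1, 2.9, 4.2]})
# only 'a' and 'b' are correlated against 'y'
print(correlate_response_with_dataframe(df, 'y', corrwith=['a', 'b']))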
Example #19
def cat_correlation(crosstab: pd.DataFrame, other: pd.Series, **kwargs) -> Axes:
    """Make a heated bar plot of the correlation between a crosstab and `other`.

    Args:
        crosstab (pd.DataFrame): Crosstab frequency table for categorical variables.
        other (pd.Series): Data for correlation. Must share index with `crosstab`.

    Returns:
        Axes: Axes for the plot.
    """
    corr = crosstab.corrwith(other).dropna().sort_values(ascending=False)
    ax = heated_barplot(corr, **kwargs)
    ax.set_xlabel("Correlation", labelpad=15)
    return ax
Example #20
    def corr(self, item_name: str, comparanda: DataFrame) -> DataFrame:
        """

        Returns correlations between a supplied vector of a given item and those of all other items.

        :param item_name: name of item
        :param comparanda: DataFrame consisting of numerical vectors each associated to a particular item (e.g., item_matrix_training if we're comparing user ratings of differnet items, and latent_content_features if we're comparing latent content features of different items)
        :param ratings: architecture-specific ratings

        """

        #vector of ratings for input item

        item_ratings = comparanda[item_name]

        #checks if entire item_ratings vector originated with empty values (i.e., every value is 2.5 at this point), set all values of output to 0 if so

        if (item_ratings == 2.5).all():

            similarity_vector = DataFrame({
                'Title':
                comparanda.columns,
                'Ratings_count':
                pd.Series(self.ratings['Number_of_ratings'][x]
                          for x in comparanda.columns)
            })
            similarity_vector['Similarity'] = pd.Series(
                [0 for x in range(len(similarity_vector.index))],
                index=similarity_vector.index)

        #else compute matrix of correlations between given item and other items, minus missing values

        else:

            similarity_vector = DataFrame({
                'Title':
                comparanda.columns,
                'Similarity':
                comparanda.corrwith(item_ratings),
                'Ratings_count':
                pd.Series(self.ratings['Number_of_ratings'][x]
                          for x in comparanda.columns)
            })
            similarity_vector = similarity_vector.sort_values(
                by=["Similarity", "Ratings_count"], ascending=False)
            similarity_vector = similarity_vector[1:]

        return similarity_vector
Example #21
def broad_corr(frame: pd.DataFrame, other: pd.DataFrame) -> pd.DataFrame:
    """Get correlations between features of one frame with those of another.

    Parameters
    ----------
    frame : DataFrame
        First DataFrame.
    other : DataFrame
        Second DataFrame.

    Returns
    -------
    DataFrame
        Pearson correlations.
    """
    return other.apply(lambda x: frame.corrwith(x))
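broad_corr pairs every column of frame with every column of other; a small demonstration on random toy frames:

import numpy as np
import pandas as pd

rng = np.random.default_rng(7)
frame = pd.DataFrame(rng.normal(size=(20, 2)), columns=['f1', 'f2'])
other = pd.DataFrame(rng.normal(size=(20, 2)), columns=['g1', 'g2'])

# result: index = columns of `frame`, columns = columns of `other`
print(broad_corr(frame, other))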
Example #22
def corr_vars(xarr1, xarr2):
    from pandas import DataFrame
    xarr3 = xarr1[:1].mean('time').copy()

    t, y, x = xarr1.shape

    df1 = DataFrame(xarr1.values.reshape(t, y * x))
    df2 = DataFrame(xarr2.values.reshape(t, y * x))

    dfcor = df1.corrwith(df2).values.reshape(y, x)
    xarr3.values = dfcor

    xarr3.attrs['long_name'] = 'Correlation of %s and %s' % (xarr1.name, xarr2.name)
    xarr3.name = 'corr_%s_vs_%s' % (xarr1.name, xarr2.name)

    xarr3.encoding.update({'zlib': True, 'shuffle': True, 'complevel': 4})

    return xarr3
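The standalone corr_vars above expects two (time, y, x) DataArrays and returns a 2-D map of per-pixel correlations over time; a minimal sketch with synthetic xarray data:

import numpy as np
import xarray as xr

t, y, x = 12, 4, 5
da1 = xr.DataArray(np.random.rand(t, y, x), dims=('time', 'y', 'x'), name='sst')
da2 = xr.DataArray(np.random.rand(t, y, x), dims=('time', 'y', 'x'), name='chl')

corr_map = corr_vars(da1, da2)  # DataArray 'corr_sst_vs_chl' with shape (y, x)
print(corr_map)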
Example #23
def cross_correlate(df: pd.DataFrame, relate_to_series: pd.Series, lag_idx=0):
    """Calculate cross correlation for a given lag.

    It is recommended to either have a lot of data in the data frame, or to use a short time frame for the lags,
    as the results are unstable if too few data points overlap in the time shifted time series.

    Args:
        df (pandas.DataFrame): Time series data to correlate with some series.
        relate_to_series (pandas.Series): Pandas Series with time series data to relate df to. Must have the same
            temporal spacing as df.
        lag_idx (int): How many indices to shift `relate_to_series` relative to the DataFrame.

    Returns:
        pandas.Series: Cross-correlation of each column of df with the shifted series.

    Examples:
        >>> df = pd.DataFrame({'x': (1, 7, 3, 5), 'y': (3, 7, 6, 4)})
        >>> cross_correlate(df, df['x'], lag_idx=1)
        x   -0.981981
        y   -0.960769
    """

    correlations = df.corrwith(relate_to_series.shift(lag_idx))
    return correlations
Example #24
    def corr_vars(self, xarr2):
        from pandas import DataFrame

        xarr1 = self._obj.copy()
        assert xarr1.shape == xarr2.shape, 'The input DataArray must be the same size as {}'.format(
            self.name)

        xarr3 = xarr1[:1].mean('time').copy()

        t, y, x = xarr1.shape

        df1 = DataFrame(xarr1.values.reshape(t, y * x))
        df2 = DataFrame(xarr2.values.reshape(t, y * x))

        dfcor = df1.corrwith(df2).values.reshape(y, x)
        xarr3.values = dfcor

        xarr3.attrs['long_name'] = 'Correlation of %s and %s' % (xarr1.name,
                                                                 xarr2.name)
        xarr3.name = 'corr_%s_vs_%s' % (xarr1.name, xarr2.name)

        xarr3.encoding.update({'zlib': True, 'shuffle': True, 'complevel': 4})

        return xarr3
Example #25
def corr_mat(
    data: pd.DataFrame,
    split: Optional[str] = None,  # Optional[Literal['pos', 'neg', 'high', 'low']]
    threshold: float = 0,
    target: Optional[Union[pd.DataFrame, pd.Series, np.ndarray, str]] = None,
    method: str = "pearson",  # Literal['pearson', 'spearman', 'kendall']
    colored: bool = True,
) -> Union[pd.DataFrame, Any]:
    """ Returns a color-encoded correlation matrix.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into Pandas DataFrame. If a Pandas DataFrame is provided, the \
        index/column information is used to label the plots
    split : Optional[str], optional
        Type of split to be performed, by default None
        {None, "pos", "neg", "high", "low"}
    threshold : float, optional
        Value between -1 and 1 to set the correlation threshold, by default 0 unless split = "high" \
        or split = "low", in which case default is 0.3
    target : Optional[Union[pd.DataFrame, str]], optional
        Specify target for correlation. E.g. label column to generate only the correlations between each \
        feature and the label, by default None
    method : str, optional
        method: {"pearson", "spearman", "kendall"}, by default "pearson"
        * pearson: measures linear relationships and requires normally distributed and homoscedastic data.
        * spearman: ranked/ordinal correlation, measures monotonic relationships.
        * kendall: ranked/ordinal correlation, measures monotonic relationships. Computationally more \
            expensive but more robust on smaller datasets than "spearman"
    colored : bool, optional
        If True the negative values in the correlation matrix are colored in red, by default True

    Returns
    -------
    Union[pd.DataFrame, pd.Styler]
        If colored = True - corr: Pandas Styler object
        If colored = False - corr: Pandas DataFrame
    """

    # Validate Inputs
    _validate_input_range(threshold, "threshold", -1, 1)
    _validate_input_bool(colored, "colored")

    def color_negative_red(val):
        color = "#FF3344" if val < 0 else None
        return "color: %s" % color

    data = pd.DataFrame(data)

    if isinstance(target, (str, list, pd.Series, np.ndarray)):
        target_data = []
        if isinstance(target, str):
            target_data = data[target]
            data = data.drop(target, axis=1)

        elif isinstance(target, (list, pd.Series, np.ndarray)):
            target_data = pd.Series(target)
            target = target_data.name

        corr = pd.DataFrame(data.corrwith(target_data, method=method))
        corr = corr.sort_values(corr.columns[0], ascending=False)
        corr.columns = [target]

    else:
        corr = data.corr(method=method)

    corr = _corr_selector(corr, split=split, threshold=threshold)

    if colored:
        return corr.style.applymap(color_negative_red).format("{:.2f}",
                                                              na_rep="-")
    else:
        return corr
Example #26
        'Value': np.random.randn(M) / 200 + 0.08,
        'ShortInterest': np.random.randn(M) / 200 - 0.02
    },
    index=tickers[:M])

ind_names = np.array(['FINANCIAL', 'TECH'])
sampler = np.random.randint(0, len(ind_names), N)
industries = Series(ind_names[sampler], index=tickers, name='industry')

by_industry = df.groupby(industries)
df_stand = by_industry.apply(zscore)

ind_rank = by_industry.rank(ascending=False)
# print(tickers)
# print(df)
# print(industries)
# print(by_industry.mean())
# print(by_industry.describe())
# print(ind_rank)
# print(df_stand.groupby(industries).agg(['mean', 'std']))
fac1, fac2, fac3 = np.random.rand(3, 1000)

ticker_subset = tickers.take(np.random.permutation(N)[:1000])

port = Series(0.7 * fac1 - 1.2 * fac2 + 0.3 * fac3 + np.random.rand(1000),
              index=ticker_subset)

factors = DataFrame({'f1': fac1, 'f2': fac2, 'f3': fac3}, index=ticker_subset)
print(factors.corrwith(port))
# print(pd.ols(y=port, x=factors).beta)
print(factors.tail())
Example #27
# descriptive statistics
df = DataFrame([[1.4,np.nan],[7.1,-4.5],
               [np.nan,np.nan],[0.75,-1.3]],
               index=list('abcd'),
               columns=['one','two'])
df.describe() 
# skipna=True, mean, std, var, sum, 
# max, min, argmax, argmin, idxmax, idxmin,
# cumsum, cumprod, diff, pct_change

# Correlation and Covariance
df = DataFrame(np.random.randn(100,3), columns=list('abc'))
df.corr() 
df.cov()
df.corrwith(df['a'])

# unique values, value counts, membership
obj = Series(['c','a','d','a','a','b','b','c','c'])
uniques = obj.unique()
obj.value_counts()
mask = obj.isin(['b','c'])
obj[mask]

# deal with missing data
df = DataFrame(np.random.randn(7,3))
df.loc[:4, 1] = np.nan; df.loc[:2, 2] = np.nan

df.dropna(thresh=3)
df.fillna(0)
df.fillna({1:0.5,3:-1})
Example #28
def _pc_volume_corr(df_basic_data):
    '''calculate the corr between price pct change and volume pct change'''
    pct_change = DataFrame(df_basic_data['Adj Close'].pct_change())
    return pct_change.corrwith(df_basic_data['Volume'].pct_change())
frame.loc['f'] = np.random.randn(4)
frame['loc'] = ['ST', 'MO'] * 3
frame.sort_index(axis=1)
frame.sort_values(by=['loc', 'STL'])
frame.rank(axis=0)
frame.rank(method='max')
um.sort_values()
um.rank()
frame.add(frame2)
frame.corrwith(um)
frame.fillna(1, inplace=True)
um = frame['UM']
frame.corr()
frame.cov()
frame2.loc['f'] = np.random.randn(3)
frame.corrwith(frame2)
frame.corrwith(um)
frame.corrwith(um.to_frame())
frame.loc[:, 'Washu':'UMST'].apply(lambda x: x.mean())
frame.set_index('UM', drop=True, inplace=True)
keys = frame.index
frame.reset_index(level=keys)

df = DataFrame(np.random.randn(6, 5),
               columns=['Ohio', 'Dallas', 'Michigan', 'Miami', 'DC'],
               index=[['a', 'a', 'b', 'b', 'c', 'd'], [1, 2, 3, 1, 2, 3]])
df.index
df.loc['a']
df.sort_index(level=0, axis=0)
df.sort_index(level=1, axis=0)
df.swaplevel(0, 1)
Example #30
N = 1000
tickers = np.array([rands(5) for _ in range(N)])

# a portfolio built from three randomly generated factors
# (usually called factor loadings) and some weights
fac1, fac2, fac3 = np.random.rand(3, 1000)

ticker_subset = tickers.take(np.random.permutation(N)[:1000])

# weighted sum of the factors plus noise
port = Series(0.7 * fac1 - 1.2 * fac2 + 0.3 * fac3 + np.random.rand(1000),
              index=ticker_subset)
factors = DataFrame({'f1': fac1, 'f2': fac2, 'f3': fac3},
                    index=ticker_subset)

print(factors.corrwith(port))

# the standard way to compute factor exposure is least-squares regression;
# pandas.ols computes the exposure of the whole portfolio
# (pd.ols was removed in pandas 0.20; statsmodels OLS is the modern replacement)
print(pd.ols(y=port, x=factors).beta)


# compute per-industry exposures via groupby
def beta_exposure(chunk, factors=None):
    return pd.ols(y=chunk, x=factors).beta

ind_names = np.array(['FINANCIAL', 'TECH'])
sampler = np.random.randint(0, len(ind_names), N)
industries = Series(ind_names[sampler], index=tickers,
                    name='industry')
Example #31
 def test_corrwith_kendall(self):
     # GH#21925
     df = DataFrame(np.random.random(size=(100, 3)))
     result = df.corrwith(df**2, method="kendall")
     expected = Series(np.ones(len(result)))
     tm.assert_series_equal(result, expected)
Example #32
def display():
    av = avogadro  # 6.02214e23 per mol
    grav = gravity  # 9.8 m/s^2
    surface_pressure = 9.96921e+36  # initialization value from Netcdf files (variable: FillValue = 9.96921e+36)
    air_mass_kg = 0.0289654  # constant from : (https://en.wikipedia.org/wiki/Density_of_air)
    multiplication_factor_to_convert_to_molecules_percm2 = 6.022141E19
    november = 11
    december = 12
    ################################# Reading OGC standardized json files ####################################
    SP5 = input("Enter Sentinel-5P JSON-File Path:")
    EPA = input("Enter EPA JSON-File Path:")
    series_sp5 = pd.read_json(SP5)
    series_epa = pd.read_json(EPA)
    #################################Choose Output file of merged Data##################################################
    output = input("Enter Merged Output File Path:")
    #####################################Convert EPA timeseries Data to Dataframes #####################################
    for series_name in series_epa:
        pollutant_name = series_name
    var_epa = series_epa[pollutant_name]
    time_array_epa = []
    air_quality_value_epa = []
    for key in var_epa.keys():
        for (subkey, value) in zip(var_epa[key].keys(), var_epa[key].values()):
            if (subkey == "time"):
                time_array_epa.append(value["instant"])
            elif (subkey == "value"):
                air_quality_value_epa.append(value)

    data_epa = {
        'Date_Value': time_array_epa,
        'Pollutant_Value': air_quality_value_epa
    }
    json_to_dataframe_epa = DataFrame(
        data_epa, columns=['Date_Value', 'Pollutant_Value'])
    json_to_dataframe_epa.Date_Value = pd.to_datetime(
        json_to_dataframe_epa.Date_Value, utc=True)
    json_to_dataframe_epa.Date_Value = pd.to_datetime(
        json_to_dataframe_epa.Date_Value, unit="s")
    # convert string pollutant value to numeric
    json_to_dataframe_epa.Pollutant_Value = pd.to_numeric(
        json_to_dataframe_epa.Pollutant_Value)
    # epa data in hours
    json_to_dataframe_epa = json_to_dataframe_epa.set_index("Date_Value")

    #####################################Convert Sentinel 5-P timeseries Data to Dataframes ############################
    for series_name in series_sp5:
        pollutant_name = series_name
    var_sp5 = series_sp5[pollutant_name]
    time_array_sp5 = []
    air_quality_value_sp5 = []
    for key in var_sp5.keys():
        for (subkey, value) in zip(var_sp5[key].keys(), var_sp5[key].values()):
            if (subkey == "time"):
                time_array_sp5.append(value["instant"])
            elif (subkey == "surface_pressure"):
                surface_pressure = value["value"]
            elif (subkey == "value"):
                if (value != ""):
                    ##### Unit Conversion of Data from mol/m2 to ppb following TOBIAS criteria ########
                    ############ Link (https://search.proquest.com/docview/2117060744 #################
                    air_column = (float(surface_pressure) *
                                  avogadro) / (gravity * air_mass_kg)
                    value = multiplication_factor_to_convert_to_molecules_percm2 * float(
                        value) / air_column / 1E9
                air_quality_value_sp5.append(value)

    data_sp5 = {
        'Date_Value': time_array_sp5,
        'Pollutant_Value': air_quality_value_sp5
    }
    json_to_dataframe_sp5 = DataFrame(
        data_sp5, columns=['Date_Value', 'Pollutant_Value'])
    json_to_dataframe_sp5.Date_Value = pd.to_datetime(
        json_to_dataframe_sp5.Date_Value, utc=True)
    json_to_dataframe_sp5.Date_Value = pd.to_datetime(
        json_to_dataframe_sp5.Date_Value, unit="s")
    # convert string pollutant value to numeric
    json_to_dataframe_sp5.Pollutant_Value = pd.to_numeric(
        json_to_dataframe_sp5.Pollutant_Value)
    # s5p one value daily in seconds
    json_to_dataframe_sp5 = json_to_dataframe_sp5.set_index(
        "Date_Value").resample('D').mean()

    ############################ THE MERGE OF THE Sentinel5-P and EPA DATASETS########################################################
    combined_dataframe = json_to_dataframe_sp5.combine_first(
        json_to_dataframe_epa).sort_values(
            'Date_Value'
        )  # sorted and null in SP5 is filled with values from EPA
    #################################################Write Merged Data to File##########################################
    outfile = open(output, 'w')
    combined_dataframe.to_csv(outfile, sep='\t')
    ###########################################Plot of autocorrelation of merged dataset ###############################
    # lags = 28 sets how many timesteps are included in the plot
    # The autocorrelation functions of the pollutant and its first, second and third order differences
    # The axes parameter controls where each graph is drawn: first plot in axes[0], second in axes[1], etc.
    # Before fitting an AR model, check that the autocorrelation decays away
    # The process should be stationary
    # plot with increasingly long time delays
    series_1 = combined_dataframe[combined_dataframe.index.month == november]
    series_2 = DataFrame(
        combined_dataframe[combined_dataframe.index.month == december])
    fig, axes = plt.subplots(1, 3, figsize=(12, 3))
    #smg.tsa.plot_acf(series_1.Pollutant_Value, lags=28, ax=axes[0])
    smg.tsa.plot_acf(series_1.Pollutant_Value.diff().dropna(),
                     lags=28,
                     ax=axes[0])
    smg.tsa.plot_acf(series_1.Pollutant_Value.diff().diff().dropna(),
                     lags=28,
                     ax=axes[1])
    smg.tsa.plot_acf(series_1.Pollutant_Value.diff().diff().diff().dropna(),
                     lags=28,
                     ax=axes[2])
    plt.show()
    ##################### Make sure there are no invalid values and drop them if they exist ##############################
    series_1 = series_1[~series_1.isin([np.nan, np.inf, -np.inf]).any(axis=1)]
    series_1 = series_1.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
    model = sm.tsa.AR(series_1.Pollutant_Value.dropna())
    result = model.fit(288)  # fit to 3 days as the frequency is D
    ############ Use the result of Durbin Watson to show that there is no significant auto-correlation #################
    sm.stats.durbin_watson(result.resid)
    print("Durbin Watson")
    print(sm.stats.durbin_watson(result.resid))
    # plot the result of Durbin Watson to show that there is no significant auto-correlation
    fig, ax = plt.subplots(1, 1, figsize=(8, 3))
    # smg.tsa.plot_acf(result.resid, lags=72, ax=ax) for hours
    smg.tsa.plot_acf(result.resid, lags=27, ax=ax)
    plt.show()
    ####################################Air Quality Forecast############################################################
    air_quality_forecast = result.predict(
        start=288, end=480, dynamic=False)  # try for the modelling
    fig, ax = plt.subplots(1, 1, figsize=(12, 4))
    ax.plot(series_1.index.values[-288:],
            series_1.Pollutant_Value.values[-288:],
            label="train data")
    ax.plot(series_2.index.values[:288],
            series_2.Pollutant_Value.values[:288],
            label="actual data")
    ax.plot(pd.date_range("2018-12-01 00:00:00+00:00",
                          "2018-12-03 00:00:00+00:00",
                          freq="15min").values,
            air_quality_forecast,
            label="predicted outcome")
    sample = series_2['2018-12-01 00:00:00+00:00':'2018-12-03 00:00:00+00:00']
    ####################################Air Quality Forecast############################################################
    print("RMSE")
    print(sqrt(mean_squared_error(sample, air_quality_forecast)))
    #####################################Check Autocorrelation SP5 and EPA##############################################
    ax.legend()
    plt.show()
    print("Correlation sp5 and epa")
    print(
        json_to_dataframe_sp5.corrwith(json_to_dataframe_epa,
                                       method='pearson'))
Example #33
def pairwise_corr(df1, df2):
    """
    :param df1:
    :type df1: pandas.core.frame.DataFrame
    :param df2:
    :type df2: pandas.core.frame.DataFrame
    :return:
    :rtype: pandas.core.frame.DataFrame
    """
    res = []
    for i in range(df2.shape[1]):
        res.append(df1.corrwith(df2.iloc[:, i]))
    res = pd.concat(res, axis=1)
    res.columns = df2.columns
    return res

pairwise_corr(df1, df3)
df1.corrwith(df3.h)


def corr_df3(obj):
    """
    :param obj:
    :type obj: pandas.core.frame.DataFrame
    :return:
    :rtype: pandas.core.frame.DataFrame
    """
    return df3.corrwith(obj)
df1.apply(corr_df3)
df1.apply(lambda x: df3.corrwith(x))
df3.apply(lambda x: df1.corrwith(x))

df3.index
Example #34
def zscore(group):
    return (group - group.mean()) / group.std()


df_stand = by_industry.apply(zscore)
print(df_stand.groupby(industries).agg(['mean', 'std']))
ind_rank = by_industry.rank(ascending=False)
print(ind_rank.groupby(industries).agg(['min', 'max']))
print(by_industry.apply(lambda x: zscore(x.rank())))

fac1, fac2, fac3 = np.random.rand(3, 1000)
ticker_subset = tickers.take(np.random.permutation(N)[:1000])
port = Series(0.7 * fac1 + 1.2 * fac2 + 0.3 * fac3 + np.random.rand(1000), index=ticker_subset)

factors = DataFrame({'f1': fac1, 'f2': fac2, 'f3': fac3}, index=ticker_subset)

print(factors.corrwith(port))
print(pd.ols(y=port, x=factors).beta)


def beta_exposure(chunk, factors=None):
    return pd.ols(y=chunk, x=factors).beta


by_ind = port.groupby(industries)
exposures = by_ind.apply(beta_exposure, factors=factors)
print(exposures.unstack())

data = web.get_data_yahoo('SPY', '2006-01-01','2012-07-27')


px = data['Adj Close']