def corr_vars(self, xarr2):
    from pandas import DataFrame

    xarr1 = self._obj.copy()
    assert (
        xarr1.shape == xarr2.shape
    ), "The input DataArray must be the same size as {}".format(self.name)
    xarr3 = xarr1[:1].mean("time").copy()
    t, y, x = xarr1.shape
    df1 = DataFrame(xarr1.values.reshape(t, y * x))
    df2 = DataFrame(xarr2.values.reshape(t, y * x))
    dfcor = df1.corrwith(df2).values.reshape(y, x)
    xarr3.values = dfcor
    xarr3.attrs["long_name"] = "Correlation of %s and %s" % (
        xarr1.name,
        xarr2.name,
    )
    xarr3.name = "corr_%s_vs_%s" % (xarr1.name, xarr2.name)
    xarr3.encoding.update({"zlib": True, "shuffle": True, "complevel": 4})
    return xarr3
def df_error_analysis(dfA: pd.DataFrame, dfB: pd.DataFrame, **kwargs):
    """Deviation analysis of two DataFrames.

    Note: the two DataFrames must share the same columns and index.

    Example call: df_error_analysis(dfA, dfB, col=['open'])
    """
    col = kwargs.pop('col', None)
    # Pearson correlation coefficient by default
    corr_method = kwargs.pop('corr_method', 'pearson')

    # Find their differences
    diff = dfA - dfB
    df = diff.describe()

    # Variance, mean absolute deviation, unbiased standard error of the mean,
    # and the correlation coefficient. DataFrame.mad() was removed in
    # pandas 2.0, so compute it manually.
    extra = pd.DataFrame([
        diff.var(),
        (diff - diff.mean()).abs().mean(),
        diff.sem(),
        dfA.corrwith(dfB, method=corr_method)
    ], index=['var', 'mad', 'sem', 'corr'])

    # Append the extra statistics to the summary
    # (DataFrame.append() was also removed in pandas 2.0).
    df = pd.concat([df, extra])

    if col is not None:
        return df[col]
    else:
        return df
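# Usage sketch for df_error_analysis (illustrative data, not from the original
# source): two aligned frames with matching index and columns.
import numpy as np
import pandas as pd

idx = pd.date_range("2021-01-04", periods=5, freq="D")
dfA = pd.DataFrame({"open": [10.0, 10.2, 10.1, 10.4, 10.3],
                    "close": [10.1, 10.0, 10.3, 10.5, 10.2]}, index=idx)
dfB = dfA + np.random.default_rng(0).normal(0, 0.05, dfA.shape)

# Summary statistics of the deviations, restricted to the 'open' column.
print(df_error_analysis(dfA, dfB, col=["open"]))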
def stocks_corr_analyzation(days, stock, *compare_stocks):
    '''Analyze the correlation of percentage changes between stocks.'''
    compare_stock_data = {}
    for s in compare_stocks:
        try:
            compare_stock_data[s.name] = get_recent_data(
                s.code, s.market_code, days, update=False)
        except Exception as e:
            print_err(e)
    df_compare_stocks_pct = DataFrame(
        {name: df['Adj Close']
         for name, df in compare_stock_data.items()}).pct_change()
    if stock is not None:
        if stock in compare_stocks:
            ser_stock_pct = df_compare_stocks_pct[stock.name]
            df_compare_stocks_pct.drop(columns=stock.name, inplace=True)
        else:
            df = get_recent_data(stock.code, stock.market_code, days,
                                 update=False)
            ser_stock_pct = df['Adj Close'].pct_change()
        ser = df_compare_stocks_pct.corrwith(ser_stock_pct)
        # Keep correlations of 0.5 and above, then format
        ser = ser[ser >= 0.5].map(lambda x: '{:.2f}'.format(x))
        ser.name = 'pct-change correlation'
        return ser.to_frame()
    else:
        df = df_compare_stocks_pct.corr()
        return df.applymap(lambda x: '{:.2f}'.format(x))
def top_correlated_features(df: DataFrame, target_feature, n=5):
    """
    Returns the names of features most strongly correlated (correlation is
    close to 1 or -1) with a target feature. Correlation is in the
    Pearson's-r sense.

    :param df: A pandas dataframe.
    :param target_feature: The name of the target feature.
    :param n: Number of top features to return.
    :return: A tuple of
        - top_n_features: Sequence of the top feature names
        - top_n_corr: Sequence of correlation coefficients of above features
        Both the returned sequences should be sorted so that the best
        (most correlated) feature is first.
    """
    # Calculate correlations with the target and sort features by them.
    table = df.corrwith(other=df[target_feature], method='pearson')
    table_abs = table.abs()
    # Take n + 1 so the target's self-correlation (always 1.0) can be skipped.
    top_n_features = table_abs.nlargest(n=n + 1).sort_values(ascending=False)
    top_n_features = [feature[0] for feature in top_n_features[1:].items()]
    top_n_corr = [table.get(f) for f in top_n_features]

    return top_n_features, top_n_corr
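# Usage sketch on invented toy data: two engineered columns track the target
# strongly, one is noise, so the top-2 query should surface 'pos' and 'neg'.
import numpy as np
from pandas import DataFrame

rng = np.random.default_rng(1)
x = rng.normal(size=200)
df = DataFrame({
    "target": x,
    "pos": x + rng.normal(0, 0.1, size=200),    # strong positive correlation
    "neg": -x + rng.normal(0, 0.1, size=200),   # strong negative correlation
    "noise": rng.normal(size=200),              # ~uncorrelated
})
names, corrs = top_correlated_features(df, "target", n=2)
print(names, corrs)  # expected: ['pos', 'neg'] (ordered by |r|)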
def test_corrwith(self, datetime_frame):
    a = datetime_frame
    noise = Series(np.random.randn(len(a)), index=a.index)
    b = datetime_frame.add(noise, axis=0)

    # make sure order does not matter
    b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:])
    del b["B"]

    colcorr = a.corrwith(b, axis=0)
    tm.assert_almost_equal(colcorr["A"], a["A"].corr(b["A"]))

    rowcorr = a.corrwith(b, axis=1)
    tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0))

    dropped = a.corrwith(b, axis=0, drop=True)
    tm.assert_almost_equal(dropped["A"], a["A"].corr(b["A"]))
    assert "B" not in dropped

    dropped = a.corrwith(b, axis=1, drop=True)
    assert a.index[-1] not in dropped.index

    # non time-series data
    index = ["a", "b", "c", "d", "e"]
    columns = ["one", "two", "three", "four"]
    df1 = DataFrame(np.random.randn(5, 4), index=index, columns=columns)
    df2 = DataFrame(np.random.randn(4, 4), index=index[:4], columns=columns)
    correls = df1.corrwith(df2, axis=1)
    for row in index[:4]:
        tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row]))
def target_correlation(features: pd.DataFrame,
                       target: DataType,
                       method: str = "pearson") -> pd.Series:
    """
    Calculate the correlation between features and target and return a
    sorted pd.Series

    Parameters
    ----------
    features: pd.DataFrame
        Features to calculate correlation for

    target: np.ndarray or pd.Series
        Target variable

    method: str
        Which correlation to use. One of 'pearson', 'spearman', 'kendall'

    Returns
    -------
    pd.Series
        Series of feature correlations sorted by absolute value
    """
    if isinstance(target, np.ndarray):
        target = pd.Series(target)
    corr = features.corrwith(target, method=method)
    # Sort positionally by absolute correlation (weakest first); use .iloc
    # since argsort returns positions, not labels.
    sorted_idx = np.argsort(corr.abs())
    return corr.iloc[sorted_idx]
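# Quick sketch of target_correlation on synthetic data; the feature names are
# made up, and a numpy target exercises the ndarray branch.
import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
features = pd.DataFrame(rng.normal(size=(100, 3)), columns=["f1", "f2", "f3"])
target = 2 * features["f1"].to_numpy() + rng.normal(0, 0.5, size=100)

# Weakest correlation first, strongest (here 'f1') last.
print(target_correlation(features, target, method="spearman"))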
def test_corrwith_index_union(self):
    df1 = DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"])
    df2 = DataFrame(np.random.random(size=(10, 3)), columns=["a", "b", "c"])

    result = df1.corrwith(df2, drop=False).index.sort_values()
    expected = df1.columns.union(df2.columns).sort_values()
    tm.assert_index_equal(result, expected)
def test_corrwith_mixed_dtypes(self, numeric_only):
    # GH#18570
    df = DataFrame(
        {"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]}
    )
    s = Series([0, 6, 7, 3])
    if numeric_only:
        result = df.corrwith(s, numeric_only=numeric_only)
        corrs = [df["a"].corr(s), df["b"].corr(s)]
        expected = Series(data=corrs, index=["a", "b"])
        tm.assert_series_equal(result, expected)
    else:
        with pytest.raises(TypeError, match="not supported for the input types"):
            df.corrwith(s, numeric_only=numeric_only)
def test_corrwith_matches_corrcoef(self):
    df1 = DataFrame(np.arange(10000), columns=["a"])
    df2 = DataFrame(np.arange(10000) ** 2, columns=["a"])
    c1 = df1.corrwith(df2)["a"]
    c2 = np.corrcoef(df1["a"], df2["a"])[0][1]

    tm.assert_almost_equal(c1, c2)
    assert c1 < 1
def cat_corr_heatmap(
    *,
    data: pd.DataFrame,
    categorical: str,
    transpose: bool = False,
    high_corr: float = None,
    scale: float = 0.5,
    no_prefix: bool = True,
    ax: plt.Axes = None,
    **kwargs,
) -> plt.Axes:
    """Plot a correlation heatmap of categorical vs. numeric features.

    Args:
        data (DataFrame): Frame containing categorical and numeric data.
        categorical (str): Name or list of names of categorical features.
        transpose (bool, optional): Swap the heatmap axes. Defaults to False.
        high_corr (float, optional): Threshold for high correlation. Defaults to None.
        scale (float, optional): Multiplier for determining figsize. Defaults to 0.5.
        no_prefix (bool, optional): If only one cat, do not prefix dummies. Defaults to True.
        ax (Axes, optional): Axes to plot on. Defaults to None.

    Returns:
        Axes: Axes of the plot.
    """
    if isinstance(categorical, str):
        ylabel = utils.to_title(categorical)
        categorical = [categorical]
        single_cat = True
    else:
        ylabel = "Categorical Features"
        single_cat = False
    title = "Correlation with Numeric Features"
    cat_df = data.filter(categorical, axis=1)
    if no_prefix and single_cat:
        dummies = pd.get_dummies(cat_df, prefix="", prefix_sep="")
    else:
        dummies = pd.get_dummies(cat_df)
    # Correlate every dummy column with every numeric column.
    corr_df = dummies.apply(lambda x: data.corrwith(x))
    if not transpose:
        corr_df = corr_df.T
    if high_corr is not None:
        if "annot" not in kwargs or kwargs.get("annot"):
            kwargs["annot"] = corr_df.values
        corr_df = corr_df.abs() > high_corr
        kwargs["center"] = None
        title = f"High {title}"
    if ax is None:
        fig, ax = plt.subplots(figsize=figsize_like(corr_df, scale=scale))
    style = dict(HEATMAP_STYLE)
    style.update(kwargs)
    ax = sns.heatmap(corr_df, ax=ax, **style)
    xlabel = "Numeric Features"
    if transpose:
        xlabel, ylabel = ylabel, xlabel
    ax.set_xlabel(xlabel, labelpad=10)
    ax.set_ylabel(ylabel, labelpad=10)
    ax.set_title(title, pad=10)
    return ax
def correlate(df: pd.DataFrame, response: str):
    """Return each column's correlation with the response, weakest first."""
    # Drop constant columns, whose correlation is undefined.
    df = df[df.columns[df.nunique() > 1]].copy()
    if response not in df.columns:
        df[response] = np.nan
    series = df[response]
    df = df.drop(columns=[response])
    corrdf = df.corrwith(series)
    return corrdf.reindex(corrdf.abs().sort_values().index)
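# Small illustration with invented columns: the constant column is filtered
# out before correlating, and results come back weakest first.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "a": [1.0, 2.0, 3.0, 4.0, 5.0],
    "b": [5.0, 4.0, 3.0, 2.0, 1.0],
    "const": [1.0] * 5,                # dropped: only one unique value
    "y": [1.1, 2.0, 2.9, 4.2, 5.1],
})
print(correlate(df, "y"))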
def test_corrwith_dup_cols(self):
    # GH#21925
    df1 = DataFrame(np.vstack([np.arange(10)] * 3).T)
    df2 = df1.copy()
    df2 = pd.concat((df2, df2[0]), axis=1)

    result = df1.corrwith(df2)
    expected = Series(np.ones(4), index=[0, 0, 1, 2])
    tm.assert_series_equal(result, expected)
def generate_recommendation(movie_matrix: pd.DataFrame, movie_title: str,
                            year: str) -> pd.DataFrame:
    user_rating = movie_matrix[f'{movie_title} ({year})']
    user_rating = user_rating.dropna()
    # Correlate every movie's ratings with the target movie's ratings.
    similar = movie_matrix.corrwith(user_rating)
    corr = pd.DataFrame(similar, columns=['correlation'])
    corr.dropna(inplace=True)
    return corr
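# Hedged usage sketch: the user-by-movie rating matrix below is fabricated,
# with NaN marking unrated movies, mirroring the '{title} ({year})' column
# convention the function expects.
import numpy as np
import pandas as pd

movie_matrix = pd.DataFrame(
    {"Heat (1995)": [5, 4, np.nan, 2, 1],
     "Casino (1995)": [5, 5, 1, 2, 1],
     "Clueless (1995)": [1, np.nan, 5, 4, 5]},
    index=["u1", "u2", "u3", "u4", "u5"],
)
print(generate_recommendation(movie_matrix, "Heat", "1995"))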
def test_corrwith_mixed_dtypes(self):
    # GH#18570
    df = DataFrame(
        {"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]}
    )
    s = Series([0, 6, 7, 3])
    result = df.corrwith(s)
    corrs = [df["a"].corr(s), df["b"].corr(s)]
    expected = Series(data=corrs, index=["a", "b"])
    tm.assert_series_equal(result, expected)
def boolean_violinplots(
    crosstab: pd.DataFrame,
    y_series: pd.Series,
    suptitle: str,
    xlabels: list = None,
    ylabel: str = None,
    include: list = None,
    figsize: tuple = (12, 8),
    **kwargs,
) -> np.array:
    """Create multiple violin plots showing distributions for True and False.

    Args:
        crosstab (pd.DataFrame): Crosstab frequency table for categorical variables.
        y_series (pd.Series): Data for y-axis.
        suptitle (str): Figure title.
        xlabels (list, optional): Labels for x-axes. Defaults to None.
        ylabel (str, optional): Label for y-axis. Defaults to None.
        include (list, optional): Columns of `crosstab` to plot. Defaults to None.
        figsize (tuple, optional): Figure size. Defaults to (12, 8).

    Returns:
        np.array: Array of Axes.
    """
    ncols = 2
    nrows = int(np.ceil(crosstab.shape[1] / 2))
    if include:
        crosstab = crosstab.loc[:, include]
        nrows = int(np.ceil(len(include) / 2))
    corr = crosstab.corrwith(y_series)
    fig, axs = plt.subplots(
        nrows=nrows, ncols=ncols, sharey=True, figsize=figsize
    )
    for i, ax in enumerate(axs.flat):
        ax = sns.violinplot(x=crosstab.iloc[:, i], y=y_series, ax=ax, **kwargs)
        ax.set_ylabel(None)
        if xlabels:
            ax.set_xlabel(xlabels[i])
        cat_corr = np.round(corr.iloc[i], 2)
        text = f"Corr: {cat_corr}"
        ax.text(
            0.975,
            1.025,
            text,
            horizontalalignment="right",
            verticalalignment="center",
            transform=ax.transAxes,
            fontsize=12,
        )
    if ylabel:
        for ax in axs[:, 0]:
            ax.set_ylabel(ylabel, labelpad=10)
    fig.suptitle(suptitle)
    fig.tight_layout()
    return axs
def __init__(self, data: pd.DataFrame, scores: pd.DataFrame,
             weights: pd.DataFrame, odm: pd.DataFrame,
             r_squared: pd.Series):
    # Cross-loadings: correlation of every indicator with every score.
    self.__crossloadings = scores.apply(lambda s: data.corrwith(s))
    loading = (self.__crossloadings * odm).sum(axis=1).to_frame(name="loading")
    communality = loading.apply(lambda s: pow(s, 2))
    communality.columns = ["communality"]
    r_squared_aux = odm.dot(
        pd.DataFrame(np.diag(r_squared), index=r_squared.index,
                     columns=r_squared.index)
    ).sum(axis=1).to_frame(name="communality")
    redundancy = communality * r_squared_aux
    redundancy.columns = ["redundancy"]
    self.__outer_model = pd.concat(
        [weights, loading, communality, redundancy], axis=1, sort=True)
def correlate(df: pd.DataFrame, response: str) -> pd.Series:
    """Return each column's correlation with the response, weakest first.

    This function is the same as in ParameterAnalysis and could be
    generalized.
    """
    # Drop constant columns, whose correlation is undefined.
    df = df[df.columns[df.nunique() > 1]].copy()
    if response not in df.columns:
        df[response] = np.nan
    series = df[response]
    df = df.drop(columns=[response])
    corrdf = df.corrwith(series)
    return corrdf.reindex(corrdf.abs().sort_values().index)
def correlate_response_with_dataframe(
        df: pd.DataFrame, response: str,
        corrwith: Optional[list] = None) -> pd.Series:
    """Return each column's correlation with the response, sorted by
    absolute correlation (weakest first)."""
    df = df[corrwith + [response]].copy() if corrwith is not None else df.copy()
    # Drop constant columns, whose correlation is undefined.
    df = df[df.columns[df.nunique() > 1]]
    if response not in df.columns:
        df[response] = np.nan
    series = df[response]
    df = df.drop(columns=[response])
    corrdf = df.corrwith(series)
    return corrdf.reindex(corrdf.abs().sort_values().index)
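# Short example with made-up columns showing the `corrwith` argument
# restricting which features are compared against the response.
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "x1": [1.0, 2.0, 3.0, 4.0],
    "x2": [4.0, 1.0, 3.0, 2.0],
    "y": [2.0, 4.0, 6.0, 8.0],
})
# Only 'x1' is correlated against the response 'y'.
print(correlate_response_with_dataframe(df, "y", corrwith=["x1"]))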
def cat_correlation(crosstab: pd.DataFrame, other: pd.Series, **kwargs) -> Axes:
    """Make a heated bar plot of the correlation between a crosstab and `other`.

    Args:
        crosstab (pd.DataFrame): Crosstab frequency table for categorical variables.
        other (pd.Series): Data for correlation. Must share index with `crosstab`.

    Returns:
        Axes: Axes for the plot.
    """
    corr = crosstab.corrwith(other).dropna().sort_values(ascending=False)
    ax = heated_barplot(corr, **kwargs)
    ax.set_xlabel("Correlation", labelpad=15)
    return ax
def corr(self, item_name: str, comparanda: DataFrame) -> DataFrame:
    """
    Returns correlations between a supplied vector of a given item and those
    of all other items.

    :param item_name: name of item
    :param comparanda: DataFrame consisting of numerical vectors each
        associated to a particular item (e.g., item_matrix_training if we're
        comparing user ratings of different items, and
        latent_content_features if we're comparing latent content features
        of different items)
    """
    # vector of ratings for the input item
    item_ratings = comparanda[item_name]

    # If the entire item_ratings vector originated with empty values (i.e.,
    # every value is 2.5 at this point), set all values of the output to 0.
    if (item_ratings == 2.5).all():
        similarity_vector = DataFrame({
            'Title': comparanda.columns,
            'Ratings_count': pd.Series(
                self.ratings['Number_of_ratings'][x]
                for x in comparanda.columns)
        })
        similarity_vector['Similarity'] = pd.Series(
            [0 for x in range(len(similarity_vector.index))],
            index=similarity_vector.index)
    # Otherwise compute the vector of correlations between the given item
    # and all other items, ignoring missing values.
    else:
        similarity_vector = DataFrame({
            'Title': comparanda.columns,
            'Similarity': comparanda.corrwith(item_ratings),
            'Ratings_count': pd.Series(
                self.ratings['Number_of_ratings'][x]
                for x in comparanda.columns)
        })
    similarity_vector = similarity_vector.sort_values(
        by=["Similarity", "Ratings_count"], ascending=False)
    # drop the first row, which is the given item itself
    similarity_vector = similarity_vector[1:]
    return similarity_vector
def broad_corr(frame: pd.DataFrame, other: pd.DataFrame) -> pd.DataFrame:
    """Get correlations between features of one frame with those of another.

    Parameters
    ----------
    frame : DataFrame
        First DataFrame.
    other : DataFrame
        Second DataFrame.

    Returns
    -------
    DataFrame
        Pearson correlations.
    """
    return other.apply(lambda x: frame.corrwith(x))
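# Sketch of broad_corr on two synthetic frames: the result has `frame`'s
# columns as rows and `other`'s columns as columns, with the planted a<->x
# relationship standing out.
import numpy as np
import pandas as pd

rng = np.random.default_rng(3)
frame = pd.DataFrame(rng.normal(size=(50, 2)), columns=["a", "b"])
other = pd.DataFrame(rng.normal(size=(50, 2)), columns=["x", "y"])
other["x"] = 0.9 * frame["a"] + rng.normal(0, 0.1, size=50)

print(broad_corr(frame, other))  # rows: a, b; columns: x, y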
def corr_vars(xarr1, xarr2):
    from pandas import DataFrame

    xarr3 = xarr1[:1].mean('time').copy()
    t, y, x = xarr1.shape
    df1 = DataFrame(xarr1.values.reshape(t, y * x))
    df2 = DataFrame(xarr2.values.reshape(t, y * x))
    dfcor = df1.corrwith(df2).values.reshape(y, x)
    xarr3.values = dfcor
    xarr3.attrs['long_name'] = 'Correlation of %s and %s' % (xarr1.name,
                                                             xarr2.name)
    xarr3.name = 'corr_%s_vs_%s' % (xarr1.name, xarr2.name)
    xarr3.encoding.update({'zlib': True, 'shuffle': True, 'complevel': 4})
    return xarr3
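# Hedged usage sketch for the standalone corr_vars, assuming xarray is
# available; the variable names and grid below are invented.
import numpy as np
import xarray as xr

rng = np.random.default_rng(4)
base = rng.normal(size=(24, 3, 4))                      # (time, y, x)
a = xr.DataArray(base, dims=("time", "y", "x"), name="tas")
b = xr.DataArray(base + rng.normal(0, 0.5, size=base.shape),
                 dims=("time", "y", "x"), name="pr")

corr_map = corr_vars(a, b)      # 2-D map of per-grid-cell correlations
print(corr_map.name)            # 'corr_tas_vs_pr'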
def cross_correlate(df: pd.DataFrame, relate_to_series: pd.Series, lag_idx=0):
    """Calculate cross correlation for a given lag.

    It is recommended to either have a lot of data in the data frame, or to
    use a short time frame for the lags, as the results are unstable if too
    few data points overlap in the time-shifted time series.

    Args:
        df (pandas.DataFrame): Time series data to correlate with some series
        relate_to_series (pandas.Series): Pandas Series with time series data
            to relate df to. Must have the same temporal spacing as df.
        lag_idx (int): How many indices to move the DataFrame in relation to
            the series.

    Returns:
        pandas.Series: Cross correlations of the columns of `df` with the
            shifted series.

    Examples:
        >>> df = pd.DataFrame({'x': (1, 7, 3, 5), 'y': (3, 7, 6, 4)})
        >>> cross_correlate(df, df['x'], lag_idx=1)
        x   -0.981981
        y   -0.142857
        dtype: float64
    """
    correlations = df.corrwith(relate_to_series.shift(lag_idx))
    return correlations
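# Building on the docstring example, a sketch that scans several lags at once
# to locate the strongest lead/lag relationship; the toy data is arbitrary.
import pandas as pd

df = pd.DataFrame({"x": (1, 7, 3, 5, 2, 8), "y": (3, 7, 6, 4, 5, 9)})
scan = pd.DataFrame({lag: cross_correlate(df, df["x"], lag_idx=lag)
                     for lag in range(-2, 3)})
print(scan)                        # one column per lag, one row per column of df
print(scan.abs().idxmax(axis=1))   # lag with the strongest |correlation|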
def corr_mat(
    data: pd.DataFrame,
    split: Optional[str] = None,  # Optional[Literal['pos', 'neg', 'high', 'low']]
    threshold: float = 0,
    target: Optional[Union[pd.DataFrame, pd.Series, np.ndarray, str]] = None,
    method: str = "pearson",  # Literal['pearson', 'spearman', 'kendall']
    colored: bool = True,
) -> Union[pd.DataFrame, Any]:
    """
    Returns a color-encoded correlation matrix.

    Parameters
    ----------
    data : pd.DataFrame
        2D dataset that can be coerced into a Pandas DataFrame. If a Pandas
        DataFrame is provided, the index/column information is used to label
        the plots
    split : Optional[str], optional
        Type of split to be performed
        {None, "pos", "neg", "high", "low"}, by default None
    threshold : float, optional
        Value between 0 and 1 to set the correlation threshold, by default 0
        unless split = "high" or split = "low", in which case default is 0.3
    target : Optional[Union[pd.DataFrame, str]], optional
        Specify target for correlation, e.g. the label column to generate
        only the correlations between each feature and the label, by
        default None
    method : str, optional
        method: {"pearson", "spearman", "kendall"}, by default "pearson"
        * pearson: measures linear relationships and requires normally
          distributed and homoscedastic data.
        * spearman: ranked/ordinal correlation, measures monotonic
          relationships.
        * kendall: ranked/ordinal correlation, measures monotonic
          relationships. Computationally more expensive but more robust in
          smaller datasets than "spearman"
    colored : bool, optional
        If True the negative values in the correlation matrix are colored in
        red, by default True

    Returns
    -------
    Union[pd.DataFrame, pd.Styler]
        If colored = True - corr: Pandas Styler object
        If colored = False - corr: Pandas DataFrame
    """
    # Validate Inputs
    _validate_input_range(threshold, "threshold", -1, 1)
    _validate_input_bool(colored, "colored")

    def color_negative_red(val):
        color = "#FF3344" if val < 0 else None
        return "color: %s" % color

    data = pd.DataFrame(data)

    if isinstance(target, (str, list, pd.Series, np.ndarray)):
        target_data = []
        if isinstance(target, str):
            target_data = data[target]
            data = data.drop(target, axis=1)
        elif isinstance(target, (list, pd.Series, np.ndarray)):
            target_data = pd.Series(target)
            target = target_data.name

        corr = pd.DataFrame(data.corrwith(target_data, method=method))
        corr = corr.sort_values(corr.columns[0], ascending=False)
        corr.columns = [target]
    else:
        corr = data.corr(method=method)

    corr = _corr_selector(corr, split=split, threshold=threshold)

    if colored:
        return corr.style.applymap(color_negative_red).format("{:.2f}", na_rep="-")
    return corr
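# Usage sketch for corr_mat, assuming the surrounding module's private
# validators (_validate_input_range, _validate_input_bool, _corr_selector)
# are importable alongside it; the data and the 'label' target are synthetic.
import numpy as np
import pandas as pd

rng = np.random.default_rng(5)
df = pd.DataFrame(rng.normal(size=(100, 4)), columns=list("abcd"))
df["label"] = 0.8 * df["a"] + rng.normal(0, 0.3, size=100)

styled = corr_mat(df, target="label")   # Styler: feature-vs-label correlations
plain = corr_mat(df, colored=False)     # plain DataFrame: full matrix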
    'Value': np.random.randn(M) / 200 + 0.08,
    'ShortInterest': np.random.randn(M) / 200 - 0.02
}, index=tickers[:M])

ind_names = np.array(['FINANCIAL', 'TECH'])
sampler = np.random.randint(0, len(ind_names), N)
industries = Series(ind_names[sampler], index=tickers, name='industry')
by_industry = df.groupby(industries)
df_stand = by_industry.apply(zscore)
ind_rank = by_industry.rank(ascending=False)

# print(tickers)
# print(df)
# print(industries)
# print(by_industry.mean())
# print(by_industry.describe())
# print(ind_rank)
# print(df_stand.groupby(industries).agg(['mean', 'std']))

fac1, fac2, fac3 = np.random.rand(3, 1000)
ticker_subset = tickers.take(np.random.permutation(N)[:1000])
port = Series(0.7 * fac1 - 1.2 * fac2 + 0.3 * fac3 + np.random.rand(1000),
              index=ticker_subset)
factors = DataFrame({'f1': fac1, 'f2': fac2, 'f3': fac3},
                    index=ticker_subset)

print(factors.corrwith(port))
# print(pd.ols(y=port, x=factors).beta)  # pd.ols was removed in pandas 0.20
print(factors.tail())
# descriptive statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=list('abcd'), columns=['one', 'two'])
df.describe()
# skipna=True, mean, std, var, sum,
# max, min, argmax, argmin, idxmax, idxmin,
# cumsum, cumprod, diff, pct_change

# Correlation and Covariance
df = DataFrame(np.random.randn(100, 3), columns=list('abc'))
df.corr()
df.cov()
df.corrwith(df['a'])

# unique values, value counts, membership
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
obj.value_counts()
mask = obj.isin(['b', 'c'])
obj[mask]

# deal with missing data
df = DataFrame(np.random.randn(7, 3))
# .ix was removed from pandas; .loc does the label-based assignment
df.loc[:4, 1] = np.nan
df.loc[:2, 2] = np.nan
df.dropna(thresh=3)
df.fillna(0)
df.fillna({1: 0.5, 3: -1})
def _pc_volume_corr(df_basic_data):
    '''calculate the corr between price pct change and volume pct change'''
    pct_change = DataFrame(df_basic_data['Adj Close'].pct_change())
    return pct_change.corrwith(df_basic_data['Volume'].pct_change())
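# Minimal sketch with a fabricated OHLCV-style frame; only the 'Adj Close'
# and 'Volume' columns the helper touches are included.
import numpy as np
from pandas import DataFrame

rng = np.random.default_rng(6)
df_basic_data = DataFrame({
    "Adj Close": 100 + rng.normal(0, 1, size=30).cumsum(),
    "Volume": rng.integers(1_000, 5_000, size=30).astype(float),
})
print(_pc_volume_corr(df_basic_data))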
frame.loc['f'] = np.random.randn(4)
frame['loc'] = ['ST', 'MO'] * 3
frame.sort_index(axis=1)
frame.sort_values(by=['loc', 'STL'])
frame.rank(axis=0)
frame.rank(method='max')
um.sort_values()  # Series.order() was removed; use sort_values()
um.rank()
frame.add(frame2)
frame.corrwith(um)  # corr() takes no second object; corrwith() does
frame.fillna(1, inplace=True)
um = frame['UM']
frame.corr()
frame.cov()
frame2.loc['f'] = np.random.randn(3)
frame.corrwith(frame2)
frame.corrwith(um)
frame.corrwith(um.to_frame())
frame.loc[:, 'Washu':'UMST'].apply(lambda x: x.mean())
frame.set_index('UM', drop=True, inplace=True)
keys = frame.index
frame.reset_index()
df = DataFrame(np.random.randn(6, 5),
               columns=['Ohio', 'Dallas', 'Michigan', 'Miami', 'DC'],
               index=[['a', 'a', 'b', 'b', 'c', 'd'],
                      [1, 2, 3, 1, 2, 3]])
df.index
df.loc['a']
df.sort_index(level=0, axis=0)  # sortlevel() was removed; use sort_index()
df.sort_index(level=1, axis=0)
df.swaplevel(0, 1)
N = 1000
tickers = np.array([rands(5) for _ in range(N)])

# Below is a portfolio built from three randomly generated factors
# (usually called factor loadings) and some weights.
fac1, fac2, fac3 = np.random.rand(3, 1000)
ticker_subset = tickers.take(np.random.permutation(N)[:1000])

# Weighted sum of the factors plus noise
port = Series(0.7 * fac1 - 1.2 * fac2 + 0.3 * fac3 + np.random.rand(1000),
              index=ticker_subset)
factors = DataFrame({'f1': fac1, 'f2': fac2, 'f3': fac3},
                    index=ticker_subset)

print(factors.corrwith(port))

# The standard way to compute factor exposures is least-squares regression.
# pandas.ols was removed in pandas 0.20; statsmodels' OLS is the usual
# replacement for computing the whole portfolio's exposures.
import statsmodels.api as sm
print(sm.OLS(port, sm.add_constant(factors)).fit().params)

# Compute per-industry exposures via groupby
def beta_exposure(chunk, factors=None):
    return sm.OLS(chunk, sm.add_constant(factors.loc[chunk.index])).fit().params

ind_names = np.array(['FINANCIAL', 'TECH'])
sampler = np.random.randint(0, len(ind_names), N)
industries = Series(ind_names[sampler], index=tickers, name='industry')
def test_corrwith_kendall(self):
    # GH#21925
    df = DataFrame(np.random.random(size=(100, 3)))
    result = df.corrwith(df**2, method="kendall")
    expected = Series(np.ones(len(result)))
    tm.assert_series_equal(result, expected)
def display():
    av = avogadro  # 6.02214e23
    grav = gravity  # 9.8 N/kg
    surface_pressure = 9.96921e+36  # fill value from NetCDF files (variable: FillValue = 9.96921e+36)
    air_mass_kg = 0.0289654  # molar mass of dry air (https://en.wikipedia.org/wiki/Density_of_air)
    multiplication_factor_to_convert_to_molecules_percm2 = 6.022141E19
    november = 11
    december = 12

    ################################# Reading OGC standardized json files ####################################
    SP5 = input("Enter Sentinel-5P JSON-File Path:")
    EPA = input("Enter EPA JSON-File Path:")
    series_sp5 = pd.read_json(SP5)
    series_epa = pd.read_json(EPA)

    ################################# Choose output file of merged data ##################################################
    output = input("Enter Merged Output File Path:")

    ##################################### Convert EPA timeseries data to DataFrames #####################################
    for series_name in series_epa:
        pollutant_name = series_name
        var_epa = series_epa[pollutant_name]
        time_array_epa = []
        air_quality_value_epa = []
        for key in var_epa.keys():
            for subkey, value in var_epa[key].items():
                if subkey == "time":
                    time_array_epa.append(value["instant"])
                elif subkey == "value":
                    air_quality_value_epa.append(value)

        data_epa = {
            'Date_Value': time_array_epa,
            'Pollutant_Value': air_quality_value_epa
        }
        json_to_dataframe_epa = DataFrame(
            data_epa, columns=['Date_Value', 'Pollutant_Value'])
        json_to_dataframe_epa.Date_Value = pd.to_datetime(
            json_to_dataframe_epa.Date_Value, utc=True)
        json_to_dataframe_epa.Date_Value = pd.to_datetime(
            json_to_dataframe_epa.Date_Value, unit="s")
        # convert string pollutant values to numeric
        json_to_dataframe_epa.Pollutant_Value = pd.to_numeric(
            json_to_dataframe_epa.Pollutant_Value)
        # EPA data is hourly
        json_to_dataframe_epa = json_to_dataframe_epa.set_index("Date_Value")

    ##################################### Convert Sentinel-5P timeseries data to DataFrames ############################
    for series_name in series_sp5:
        pollutant_name = series_name
        var_sp5 = series_sp5[pollutant_name]
        time_array_sp5 = []
        air_quality_value_sp5 = []
        for key in var_sp5.keys():
            for subkey, value in var_sp5[key].items():
                if subkey == "time":
                    time_array_sp5.append(value["instant"])
                elif subkey == "surface_pressure":
                    surface_pressure = value["value"]
                elif subkey == "value":
                    if value != "":
                        ##### Unit conversion of data from mol/m2 to ppb following the TOBIAS criteria ########
                        ############ Link (https://search.proquest.com/docview/2117060744) #################
                        air_column = (float(surface_pressure) * avogadro) / (
                            gravity * air_mass_kg)
                        value = (multiplication_factor_to_convert_to_molecules_percm2
                                 * float(value) / air_column / 1E9)
                    air_quality_value_sp5.append(value)

        data_sp5 = {
            'Date_Value': time_array_sp5,
            'Pollutant_Value': air_quality_value_sp5
        }
        json_to_dataframe_sp5 = DataFrame(
            data_sp5, columns=['Date_Value', 'Pollutant_Value'])
        json_to_dataframe_sp5.Date_Value = pd.to_datetime(
            json_to_dataframe_sp5.Date_Value, utc=True)
        json_to_dataframe_sp5.Date_Value = pd.to_datetime(
            json_to_dataframe_sp5.Date_Value, unit="s")
        # convert string pollutant values to numeric
        json_to_dataframe_sp5.Pollutant_Value = pd.to_numeric(
            json_to_dataframe_sp5.Pollutant_Value)
        # S5P has one value daily, in seconds; resample to daily means
        json_to_dataframe_sp5 = json_to_dataframe_sp5.set_index(
            "Date_Value").resample('D').mean()

    ############################ Merge the Sentinel-5P and EPA datasets ############################
    # sorted; nulls in SP5 are filled with values from EPA
    combined_dataframe = json_to_dataframe_sp5.combine_first(
        json_to_dataframe_epa).sort_values('Date_Value')

    ################################################# Write merged data to file ##########################################
    outfile = open(output, 'w')
    combined_dataframe.to_csv(outfile, sep='\t')

    ########################################### Plot of autocorrelation of merged dataset ###############################
    # lags=28 is how many time steps to include in the plot.
    # Plot the autocorrelation functions for the pollution series and its
    # first, second and third order differences, with increasing time delays.
    # The axes parameter selects which subplot each graph is drawn on: first
    # plot in axes 0, second in axes 1, etc.
    # To fit an AR model the series should be stationary, so check that the
    # autocorrelation decays.
    series_1 = combined_dataframe[combined_dataframe.index.month == november]
    series_2 = DataFrame(
        combined_dataframe[combined_dataframe.index.month == december])
    fig, axes = plt.subplots(1, 3, figsize=(12, 3))
    # smg.tsa.plot_acf(series_1.Pollutant_Value, lags=28, ax=axes[0])
    smg.tsa.plot_acf(series_1.Pollutant_Value.diff().dropna(),
                     lags=28, ax=axes[0])
    smg.tsa.plot_acf(series_1.Pollutant_Value.diff().diff().dropna(),
                     lags=28, ax=axes[1])
    smg.tsa.plot_acf(series_1.Pollutant_Value.diff().diff().diff().dropna(),
                     lags=28, ax=axes[2])
    plt.show()

    ##################### Make sure there are no invalid values and drop them if they exist ##############################
    series_1 = series_1[~series_1.isin([np.nan, np.inf, -np.inf]).any(axis=1)]
    series_1 = series_1.replace([np.inf, -np.inf], np.nan).dropna(axis=1)

    # sm.tsa.AR was removed from statsmodels; AutoReg is its replacement.
    from statsmodels.tsa.ar_model import AutoReg
    model = AutoReg(series_1.Pollutant_Value.dropna(), lags=288)
    result = model.fit()  # 288 lags, i.e. 3 days at 15-minute spacing

    ############ Use the Durbin-Watson statistic to show there is no significant autocorrelation #################
    print("Durbin Watson")
    print(sm.stats.durbin_watson(result.resid))

    # plot the residual autocorrelation to show that there is no significant
    # autocorrelation left
    fig, ax = plt.subplots(1, 1, figsize=(8, 3))
    # smg.tsa.plot_acf(result.resid, lags=72, ax=ax)  # for hourly data
    smg.tsa.plot_acf(result.resid, lags=27, ax=ax)
    plt.show()

    #################################### Air quality forecast ############################################################
    air_quality_forecast = result.predict(start=288, end=480, dynamic=False)
    fig, ax = plt.subplots(1, 1, figsize=(12, 4))
    ax.plot(series_1.index.values[-288:],
            series_1.Pollutant_Value.values[-288:], label="train data")
    ax.plot(series_2.index.values[:288],
            series_2.Pollutant_Value.values[:288], label="actual data")
    ax.plot(pd.date_range("2018-12-01 00:00:00+00:00",
                          "2018-12-03 00:00:00+00:00",
                          freq="15min").values,
            air_quality_forecast, label="predicted outcome")
    sample = series_2['2018-12-01 00:00:00+00:00':'2018-12-03 00:00:00+00:00']
    print("RMSE")
    print(sqrt(mean_squared_error(sample, air_quality_forecast)))

    ##################################### Check correlation of SP5 and EPA ##############################################
    ax.legend()
    plt.show()
    print("Correlation sp5 and epa")
    print(json_to_dataframe_sp5.corrwith(json_to_dataframe_epa,
                                         method='pearson'))
def pairwise_corr(df1, df2):
    """
    :param df1:
    :type df1: pandas.core.frame.DataFrame
    :param df2:
    :type df2: pandas.core.frame.DataFrame
    :return:
    :rtype: pandas.core.frame.DataFrame
    """
    res = []
    for i in range(df2.shape[1]):
        # .ix was removed from pandas; .iloc does the positional selection
        res.append(df1.corrwith(df2.iloc[:, i]))
    res = pd.concat(res, axis=1)
    res.columns = df2.columns
    return res

pairwise_corr(df1, df3)
df1.corrwith(df3.h)

def corr_df3(obj):
    """
    :param obj:
    :type obj: pandas.core.frame.DataFrame
    :return:
    :rtype: pandas.core.frame.DataFrame
    """
    return df3.corrwith(obj)

df1.apply(corr_df3)
df1.apply(lambda x: df3.corrwith(x))
df3.apply(lambda x: df1.corrwith(x))
df3.index
def zscore(group):
    return (group - group.mean()) / group.std()

df_stand = by_industry.apply(zscore)
print(df_stand.groupby(industries).agg(['mean', 'std']))
ind_rank = by_industry.rank(ascending=False)
print(ind_rank.groupby(industries).agg(['min', 'max']))
print(by_industry.apply(lambda x: zscore(x.rank())))

fac1, fac2, fac3 = np.random.rand(3, 1000)
ticker_subset = tickers.take(np.random.permutation(N)[:1000])
port = Series(0.7 * fac1 + 1.2 * fac2 + 0.3 * fac3 + np.random.rand(1000),
              index=ticker_subset)
factors = DataFrame({'f1': fac1, 'f2': fac2, 'f3': fac3},
                    index=ticker_subset)
print(factors.corrwith(port))

# pandas.ols was removed in pandas 0.20; statsmodels' OLS is the usual
# replacement.
import statsmodels.api as sm
print(sm.OLS(port, sm.add_constant(factors)).fit().params)

def beta_exposure(chunk, factors=None):
    return sm.OLS(chunk, sm.add_constant(factors.loc[chunk.index])).fit().params

by_ind = port.groupby(industries)
exposures = by_ind.apply(beta_exposure, factors=factors)
print(exposures.unstack())

data = web.get_data_yahoo('SPY', '2006-01-01', '2012-07-27')
px = data['Adj Close']