def test_cython_api2():
    """Check grouped cumsum/cumprod against hand-built and frame-level results."""
    # this takes the fast apply path

    # cumsum (GH5614)
    frame = DataFrame(
        [[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"]
    )
    want = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"])
    tm.assert_frame_equal(frame.groupby("A").cumsum(), want)

    # GH 5755 - cumsum is a transformer and should ignore as_index
    tm.assert_frame_equal(frame.groupby("A", as_index=False).cumsum(), want)

    # GH 13994 - with axis=1 the grouped result must match the plain frame op
    for op_name in ("cumsum", "cumprod"):
        got = getattr(frame.groupby("A"), op_name)(axis=1)
        want = getattr(frame, op_name)(axis=1)
        tm.assert_frame_equal(got, want)
def pandas_df_demo06():
    """Print a short demo of relabelling a DataFrame and taking cumulative sums."""
    # 1: replace, transform and rename axis labels
    cities = DataFrame(
        np.arange(9).reshape(3, 3),
        index=['bj', 'sh', 'gz'],
        columns=['a', 'b', 'c'],
    )
    print('dataframe:\n', cities)

    full_names = Series(['beijing', 'shanghai', 'guangzhou'])
    cities.index = full_names
    print('\nupdate index df:\n', cities)

    cities.index = cities.index.map(str.upper)
    print('\nupdate index with upper df:\n', cities)

    # rename() returns a new frame; `cities` itself keeps its upper-case index
    renamed = cities.rename(index=str.lower, columns=str.upper)
    print('\nupdate index and cols df:\n', renamed)

    # 2: cumulative sums along each axis
    rows = [
        [2.0, 1.0, 3.0, 5],
        [3.0, 4.0, 5.0, 5],
        [3.0, 4.0, 5.0, 5],
        [1.0, 0.0, 6.0, 5],
    ]
    numbers = DataFrame(rows, columns=list('abcd'))
    down_rows = numbers.cumsum(axis=0)  # default
    print('\nsum by rows df:\n', down_rows)
    across_cols = numbers.cumsum(axis=1)
    print('\nsum by cols df:\n', across_cols)
def test_cython_api2():
    """Exercise the cumulative groupby transforms (cumsum and cumprod)."""
    # this takes the fast apply path

    # cumsum (GH5614)
    rows = [[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]]
    df = DataFrame(rows, columns=['A', 'B', 'C'])
    per_group = DataFrame(
        [[2, np.nan], [np.nan, 9], [4, 9]], columns=['B', 'C'])

    tm.assert_frame_equal(df.groupby('A').cumsum(), per_group)

    # GH 5755 - cumsum is a transformer and should ignore as_index
    tm.assert_frame_equal(df.groupby('A', as_index=False).cumsum(), per_group)

    # GH 13994
    tm.assert_frame_equal(df.groupby('A').cumsum(axis=1), df.cumsum(axis=1))
    tm.assert_frame_equal(df.groupby('A').cumprod(axis=1), df.cumprod(axis=1))
# 一个索引有多个值,那么该索引就会返回多个值。 obj['a'] ## 汇总和计算描述统计 df = DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]], index=['a','b','c','d'], columns=['one','two']) # 对列 df.sum() # 对行 df.sum(axis=1) # 默认会排除NA,但是可以通过skipna禁用该功能 df.mean(axis=1,skipna=False) # 返回最大值的索引 df.idxmax() # 累加 df.cumsum() df.describe() # 相关系数 returns.MSFT.corr(returns.IBM) returns.corr() returns.cov() returns.corrwith(returns.IBM) ## 唯一值,值计数以及成员资格 obj = Series(['c','a','d','a','a','b','b','c','c']) uniques = obj.unique() # 统计个数 obj.value_counts() # 统计个数后默认排序,也可以不排序 pd.value_counts(obj.values, sort=False) # 判断是否存在
import numpy as np
import pandas as pd
from numpy.random import randn
from pandas import Series,DataFrame
import matplotlib.pyplot as plt

# 2x3 array containing NaNs, wrapped in a frame with rows 1-2 and columns A-C
array1=np.array([[10,np.nan,20],[30,40,np.nan]])
print(array1)
df1=DataFrame(array1,index=[1,2],columns=list('ABC'))
print(df1)
# Reductions: per-column sum, per-row sum (axis=1), min, max, label of the max
print(df1.sum())
print(df1.sum(axis=1))
print(df1.min())
print(df1.max())
print(df1.idxmax())
# Running total and summary statistics
print(df1.cumsum())
print(df1.describe())

# 3x3 frame of random draws, plotted and written to samplepic.png before showing
df2=DataFrame(randn(9).reshape(3,3),index=[1,2,3],columns=list('ABC'))
print(df2)
plt.plot(df2)
plt.legend(df2.columns,loc="lower right")
plt.savefig("samplepic.png")
plt.show()

# Unique values and their frequencies in a Series of single letters
ser=Series(list('abcccaabd'))
print(ser)
print(ser.unique())
print(ser.value_counts())
######################################################################
# Sum of each column; NaN values are ignored
dframe.sum()
# Sum of each row
dframe.sum(axis=1)
# Minimum value of each column (max() works the same way)
dframe.min()
# Index label of the minimum value of each column
dframe.idxmin()
# Cumulative sum
dframe.cumsum()
# describe() creates summary statistics for each column.
# (Was misspelled `descirbe`, which raises AttributeError.)
dframe.describe()  # count, mean, std, min, ....

# Covariance and Correlation
# NOTE(review): pandas.io.data was removed from pandas; on current versions this
# import fails and the pandas-datareader package is needed instead. Left as-is
# because that package is not a dependency of this file.
import pandas.io.data as pdweb
import datetime

# Getting the stock data from the internet and displaying the first 5 rows
prices = pdweb.get_data_yahoo(['CVX', 'XOM', 'BP'],
                              start=datetime.datetime(2010, 1, 1),
                              end=datetime.datetime(2013, 1, 1))['Adj Close']
prices.head()
def test_cumsum_corner(self): dm = DataFrame(np.arange(20).reshape(4, 5), index=range(4), columns=range(5)) # TODO(wesm): do something with this? result = dm.cumsum() # noqa
# In[ ]: df.idxmax() #各列最大值的index # In[ ]: df # In[ ]: df.cumsum() #每列按行累加 # In[ ]: df.describe() #给出常见的统计量 # In[ ]: s = Series(['a', 'a', 'b', 'c'] * 4) s
df ''' one two a 1.0 NaN b 7.0 4.0 c NaN NaN d 0.0 1.0 ''' print df.idxmax() # 计算每一列最大值的索引 ''' one b two b ''' print df.cumsum() # 每一列的累加和 ''' one two a 1.0 NaN b 8.0 4.0 c NaN NaN d 8.0 5.0 ''' print df.describe() # 对DataFrame每列计算汇总统计 ''' one two count 3.000000 2.00000 mean 2.666667 2.50000 std 3.785939 2.12132 min 0.000000 1.00000
# -*- coding:utf-8 -*-
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
from numpy.random import randn

# 52 weekly timestamps starting in January 2016
weekly_index = pd.date_range('1/1/2016', periods=52, freq='W')

# `ts` only supplies the index below; its random values are never plotted,
# but the draw is kept so the random stream is consumed in the same order.
ts = pd.Series(randn(52), index=weekly_index)

# Five random series sharing that index, plotted as running totals
df = DataFrame(randn(52, 5), index=ts.index, columns=list('ABCDE'))
running_total = df.cumsum()
running_total.plot()
# Sum method dframe1.sum() # ignores null values (treats them as 0s) dframe1.sum(axis=1) # sum across rows # Min method dframe1.min() # finds the minimum value in each column dframe1.min(axis=1) # minimum value of each row dframe1.idxmin() # Find the index of minimum value column # Max method dframe1.max() dframe1.idxmax() # Cumulative sum dframe1.cumsum() # accumulates along each columns values # Describe method dframe1.describe() # summary statistics of dataframe (by columns) # correlation and covariance import pandas.io.data as pdweb # import pandas_datareader.data as pdweb import datetime prices = pdweb.get_data_yahoo( ["CVX", "XOM", "BP"], start=datetime.datetime(2010, 1, 1), end=datetime.datetime(2013, 1, 1) )["Adj Close"] prices.head()
##### Summary arr = np.array([[1,2,np.nan],[np.nan,3,4]]) arr df1 = DataFrame(arr,index = ['a','b'],columns = ['one','two','three']) df1 df1.sum() #default axis is 0, and Pandas will ignore the nan values df1.sum(axis=1) #by row df1 df1.min() df1.min(axis=1) df1.idxmin() df1.idxmin(axis=1) df1.cumsum() # accumulation sum ### unique() and value_count() methods for factor variables ser1 = Series(['w','w','x','y','z','w','x','y','x','a']) ser1 ser1.unique() ser1.value_counts() ###describe method df1.describe() # similar to the summary() method in R will provide summary stat. ###covariance matrices and some visulaization import pandas.io.data as pdweb import datetime prices = pdweb.get_data_yahoo(['CVX','XOM','BP'],start = datetime.datetime(2010,1,1),
__author__ = 'Executor'
import numpy as np
import pandas as pa
from pandas import Series, DataFrame

# 2x3 frame with one NaN per row
arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])
dframe1 = DataFrame(arr, index=['A', 'B'], columns=['One', 'Two', 'Three'])

# Per-column sum, per-row sum (axis=1) and per-column minimum
print(dframe1.sum())
print(dframe1.sum(axis=1))
print(dframe1.min())
print(dframe1)
# Index label of the minimum in each column
print(dframe1.idxmin())
print(dframe1)
# Running total down each column, then summary statistics
print(dframe1.cumsum())
print(dframe1.describe())

from IPython.display import YouTubeVideo
# NOTE(review): these bare expressions only build YouTubeVideo objects; nothing
# here displays them, which is presumably what the remark below refers to.
YouTubeVideo('xGbpuFNR1ME')
YouTubeVideo('4EXNedimDMs')
''' stupid thing doesn't work!'''
# -*- coding: utf-8 -*-
# Descriptive-statistics demo. Every print is written as a single-argument call
# so the script runs unchanged on Python 2 and Python 3 (the original used
# Python 2 print statements, which are syntax errors on Python 3).

import numpy as np
from pandas import Series, DataFrame

print('求和')
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
print(df)
print(df.sum())  # sum of each column
print(df.sum(axis=1))  # sum of each row
# print('') emits the same blank line under both Python versions
print('')

print('平均数')
print(df.mean(axis=1, skipna=False))  # a row containing NaN yields NaN
print(df.mean(axis=1))  # NaN values are skipped
print('')

print('其它')
print(df.idxmax())
print(df.cumsum())
print(df.describe())
obj = Series(['a', 'a', 'b', 'c'] * 4)
print(obj.describe())
columns=['one','two'] ) print(df) print('\n') print(df.sum()) print('\n') print(df.sum(axis=1)) print('\n') print(df.mean()) print('\n') print(df.mean(axis=1,skipna=False)) print('\n') print(df.idxmax()) print('\n') print(df.cumsum()) print('\n') print(df.cumsum(axis=1)) print('\n') print(df.describe()) print('\n') ############################################################### obj = Series(['a','a','b','c']*4) print(obj) print(obj.describe()) print('\n') ###############################################################
import numpy as np
import pandas as pd
import IPython
from pandas import Series, DataFrame

# 2x3 frame with one NaN per row
arr = np.array([[1, 2, np.nan], [np.nan, 3, 4]])
dframe1 = DataFrame(arr, index=['A', 'B'], columns=['one', 'two', 'three'])
print(dframe1)
print(dframe1.sum())  # sums the values down each column (axis=0, the default)
# Sums the values across each row. This previously passed axis=0, which just
# repeated the column sums printed above; rows need axis=1.
print(dframe1.sum(axis=1))
print(dframe1.min())  # returns min value in each column
print(dframe1.max())  # returns max value in each column
print(dframe1.idxmin())  # returns index label of the min value in each column
print(dframe1.cumsum())  # running total down the rows of each column
print(dframe1.describe())  # summary statistics: count, mean, std, min, max, percentiles

# from IPython.display import YouTubeVideo
# YouTubeVideo('xGbpuFNR1ME')
# YouTubeVideo('4EXNedimDMs')

from pandas_datareader import data  # allows us to get some information from the web
import datetime  # library for date input
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline

# Adjusted close prices for three tickers, 2010-01-01 to 2013-01-01 (network call)
prices = data.get_data_yahoo(
    ['CVX', 'XOM', 'BP'],
    start=datetime.datetime(2010, 1, 1),
    end=datetime.datetime(2013, 1, 1))['Adj Close']  # get stock price at that time
print '其它函数' print df ''' one two a 1.0 NaN b 7.0 4.0 c NaN NaN d 0.0 1.0 ''' print df.idxmax() # 计算每一列最大值的索引 ''' one b two b ''' print df.cumsum() # 每一列的累加和 ''' one two a 1.0 NaN b 8.0 4.0 c NaN NaN d 8.0 5.0 ''' print df.describe() # 对DataFrame每列计算汇总统计 ''' one two count 3.000000 2.00000 mean 2.666667 2.50000 std 3.785939 2.12132 min 0.000000 1.00000 25% NaN NaN
#Notice how it ignores NaN values dframe1.sum(axis=1) #Can also grab min and max values of dataframe dframe1.min() #As well as there index dframe1.idxmin() dframe1.idxmax() dframe1.max() dframe1 #Can also do an accumulation sum dframe1.cumsum() #A very useful feature is describe, which provides summary statistics describe=dframe1.describe() # We can also get information on correlation and covariance #For more info on correlation and covariance, check out the videos below! from IPython.display import YouTubeVideo YouTubeVideo('xGbpuFNR1ME') #Now lets check correlation and covariance on some stock prices! #Pandas can get info off the web import pandas_datareader as pdweb #workaround pandas ver 0.24.2 import datetime
def plotter(plot_dir: Path, location_id: int, location_name: str,
            input_data: Dict, sero_data: pd.DataFrame, ratio_model_inputs: Dict,
            cross_variant_immunity: List[int],
            escape_variant_prevalence: pd.Series,
            output_data: Dict, smooth_infections: pd.Series,
            output_draws: pd.DataFrame, population: float,
            measures: List[str] = None):
    """Draw the per-location diagnostic figure and save it as a PDF or show it.

    The figure is a 12-row by 3-column grid:

    * column 0 - one daily panel per measure (4 rows each);
    * column 1 - per measure, a cumulative panel (top 2 rows) and a ratio
      panel (bottom 2 rows);
    * column 2 - daily infections, cumulative infections (%) and cumulative
      infected (%), 4 rows each.

    Parameters
    ----------
    plot_dir
        Directory that receives ``<location_id>.pdf``. When ``None`` the
        figure is shown with ``plt.show()`` instead.
    location_id, location_name
        Used for the figure title and the output file name.
    input_data
        Keyed by measure. Each entry is read for ``'daily'``, ``'cumul'``,
        ``'lags'``, ``'scalar'`` and ``'ratio'``.
    sero_data
        Passed through to the cumulative-infections model panel.
    ratio_model_inputs
        Keyed by measure. ``'ratio_mean'`` is used for the axis range and
        the whole entry is handed to ``ratio_plot``.
    cross_variant_immunity
        Draws passed to ``calc_infected``: their mean for point estimates,
        the full array for ``output_draws``.
    escape_variant_prevalence
        Date-indexed series. Dates missing between 2019-11-01 and
        ``end_date`` are added and back/forward filled.
    output_data
        Keyed by measure. Each entry is read for ``'daily'``, ``'cumul'``,
        ``'infections_daily'`` and ``'infections_cumul'``.
    smooth_infections, output_draws
        Daily infections (point estimate and draws).
    population
        Denominator for the percentage panels.
    measures
        Measures to plot, in row order. Defaults to
        ``['cases', 'hospitalizations', 'deaths']``.
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    if measures is None:
        measures = ['cases', 'hospitalizations', 'deaths']

    start_date, end_date = get_dates(input_data, output_data, output_draws)

    n_cols = 3
    n_rows = 12
    widths = [2, 1, 2]
    heights = [1] * n_rows

    sns.set_style('whitegrid')
    fig = plt.figure(figsize=(16, 9), constrained_layout=True)
    gs = fig.add_gridspec(n_rows, n_cols, width_ratios=widths, height_ratios=heights)

    # Columns 0 and 1 (top half): observed vs. modelled daily / cumulative data.
    for i, measure in enumerate(measures):
        daily_ax = fig.add_subplot(gs[i * 4:i * 4 + 4, 0])
        cumul_ax = fig.add_subplot(gs[i * 4:i * 4 + 2, 1])
        if measure in input_data:
            # The x-axis label flag is only set for the last measure.
            data_plot(daily_ax, measure.capitalize(), 'Daily',
                      input_data[measure]['daily'][1:], output_data[measure]['daily'][1:],
                      MEASURE_COLORS[measure]['light'], MEASURE_COLORS[measure]['dark'],
                      start_date, end_date, measure == measures[-1])
            data_plot(cumul_ax, None, 'Cumulative',
                      input_data[measure]['cumul'], output_data[measure]['cumul'],
                      MEASURE_COLORS[measure]['light'], MEASURE_COLORS[measure]['dark'],
                      start_date, end_date)
        else:
            daily_ax.axis('off')
            cumul_ax.axis('off')

    # Column 1 (bottom half): infection-to-outcome ratios.
    ratio_names = {'deaths': 'IFR', 'hospitalizations': 'IHR', 'cases': 'IDR'}
    for i, measure in enumerate(measures):
        ratio_ax = fig.add_subplot(gs[i * 4 + 2:i * 4 + 4, 1])
        if measure in input_data:
            # Shift infections forward by the mean lag so they line up with the
            # outcome dates, then divide the scaled outcome by them.
            adj_ratio = smooth_infections.copy()
            adj_ratio.index += pd.Timedelta(
                days=int(np.mean(input_data[measure]['lags'])))
            adj_ratio = (output_data[measure]['daily']
                         * input_data[measure]['scalar'].mean()) / adj_ratio
            adj_ratio = adj_ratio.dropna()

            # Built once; both ratio columns come from the same NaN-dropped frame.
            ratio_frame = pd.concat([
                input_data[measure]['ratio'].groupby(level=1).mean(),
                input_data[measure]['daily'],
            ], axis=1).dropna()
            ratio_data = ratio_frame['ratio']
            ratio_data_fe = ratio_frame['ratio_fe']

            ratio_plot_range = pd.concat([
                ratio_data, ratio_data_fe,
                ratio_model_inputs[measure]['ratio_mean'],
            ])
            ratio_plot_range = ratio_plot_range.replace(
                (-np.inf, np.inf), np.nan).dropna()
            ratio_plot_range_min = ratio_plot_range.min()
            ratio_plot_range_max = ratio_plot_range.max()
            # Pad by 20% of the maximum on both sides, clamped to [0, 1].
            ratio_plot_lims = (
                max(0, ratio_plot_range_min - ratio_plot_range_max * 0.2),
                min(1, ratio_plot_range_max + ratio_plot_range_max * 0.2),
            )

            ratio_plot(ratio_ax, ratio_plot_lims, ratio_names[measure],
                       ratio_data, ratio_data_fe, adj_ratio,
                       ratio_model_inputs[measure],
                       MEASURE_COLORS[measure]['light'], MEASURE_COLORS[measure]['dark'],
                       start_date, end_date, measure == measures[-1])
        else:
            ratio_ax.axis('off')

    # Column 2: model panels.
    model_measures = [m for m in measures if m in output_data]

    dailymodel_ax = fig.add_subplot(gs[0:4, 2])
    infection_daily_data = {
        mm: pd.concat(output_data[mm]['infections_daily'],
                      axis=1).dropna().mean(axis=1)[1:]
        for mm in model_measures
    }
    model_plot(dailymodel_ax, 'Infections', 'Daily infections',
               infection_daily_data, None,
               smooth_infections.dropna()[1:], output_draws.dropna()[1:],
               start_date, end_date, False)

    cumul_infections_measures = {
        mm: (pd.concat(output_data[mm]['infections_cumul'],
                       axis=1).dropna().mean(axis=1) / population) * 100
        for mm in model_measures
    }
    cumul_infections_draws = output_draws.cumsum().dropna()
    cumul_infections_point = smooth_infections.cumsum().dropna()

    cumulinfmodel_ax = fig.add_subplot(gs[4:8, 2])
    model_plot(cumulinfmodel_ax, None, 'Cumulative infections (%)',
               cumul_infections_measures, sero_data,
               (cumul_infections_point / population) * 100,
               (cumul_infections_draws / population) * 100,
               start_date, end_date, False)

    # Make sure the prevalence series covers every date from 2019-11-01 on.
    expand_dates = [
        date for date in pd.date_range('2019-11-01', end_date)
        if date not in escape_variant_prevalence.index
    ]
    if expand_dates:
        date_idx = pd.Index(expand_dates, name='date')
        if isinstance(escape_variant_prevalence, pd.Series):
            escape_variant_prevalence = pd.concat([
                pd.Series(np.nan, index=date_idx,
                          name=escape_variant_prevalence.name),
                escape_variant_prevalence,
            ])
            # Same as fillna(method='bfill').fillna(method='ffill'), without
            # the deprecated `method` keyword.
            escape_variant_prevalence = escape_variant_prevalence.bfill().ffill()
        elif escape_variant_prevalence.empty:
            escape_variant_prevalence = pd.Series(
                np.nan, index=date_idx, name='escape_variant_prevalence')

    cumul_infected_measures = {
        mm: pd.concat(output_data[mm]['infections_daily'],
                      axis=1).dropna().mean(axis=1)
        for mm in model_measures
    }
    cumul_infected_measures = {
        mm: (calc_infected(
            mm_data,
            escape_variant_prevalence.loc[mm_data.index].values,
            np.mean(cross_variant_immunity),
            population,
        ).dropna() / population) * 100
        for mm, mm_data in cumul_infected_measures.items()
    }
    cumul_infected_draws = calc_infected(
        output_draws,
        escape_variant_prevalence.loc[output_draws.index].to_frame().values,
        np.array(cross_variant_immunity),
        population,
    ).dropna()
    cumul_infected_point = calc_infected(
        smooth_infections,
        escape_variant_prevalence.loc[smooth_infections.index].values,
        np.mean(cross_variant_immunity),
        population,
    ).dropna()
    # Drop the local references to the large inputs before the final panel.
    del output_draws, smooth_infections

    cumulpropmodel_ax = fig.add_subplot(gs[8:12, 2])
    model_plot(cumulpropmodel_ax, None, 'Cumulative infected (%)',
               cumul_infected_measures, None,
               (cumul_infected_point / population) * 100,
               (cumul_infected_draws / population) * 100,
               start_date, end_date, True)

    fig.suptitle(f'{location_name} ({location_id})', fontsize=20)
    # NOTE(review): the figure was created with constrained_layout=True;
    # calling tight_layout() as well may be redundant - confirm before changing.
    plt.tight_layout()
    if plot_dir is not None:
        plt.switch_backend('pdf')
        fig.savefig(plot_dir / f'{location_id}.pdf')
        plt.close(fig)
    else:
        plt.show()
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
from numpy import nan as NA

### Descriptive statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
df
df.sum()  # per-column sum
df.sum(axis=1)  # per-row sum; NB for this one NaNs are treated as 0
df.cumsum()  # running total down each column
df.mean(axis=1, skipna=False)  # skipna=False: a row containing NaN gives NaN
df.describe()  # also works on other objects
df.idxmax()  # returns the index label of the max in each column

### Handling Missing Data
string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
string_data.isnull()
# Assign None to the first element, then re-run the null check on it
string_data[0] = None
string_data.isnull()
data = Series([1, NA, 3.5, NA, 7])
def make_overview_chart(
    series: pd.Series, title: str, subtitle_base: str = "Log Returns"
) -> go.Figure:
    """Build a 2x2 overview figure for a return series.

    Panels: the raw series (top-left), its histogram with a normal overlay
    (top-right), its cumulative sum (bottom-left) and a Q/Q plot against the
    standard normal (bottom-right).

    Parameters
    ----------
    series
        One-dimensional return series. Its index must support ``strftime``
        (it is formatted into the figure title).
    title
        First line of the figure title; the series' date range is appended.
    subtitle_base
        Label used in the subplot titles.

    Returns
    -------
    go.Figure
        The assembled figure.
    """
    fig = make_subplots(
        rows=2,
        cols=2,
        subplot_titles=[
            subtitle_base,
            f"{subtitle_base} Distribution",
            f"Cumulative {subtitle_base}",
            "Q/Q Plot",
        ],
        vertical_spacing=0.09,
        horizontal_spacing=0.08,
    )

    # Returns Distribution
    series_cuts = pd.cut(series, 100).value_counts().sort_index()
    # The PDF is evaluated at the same bin midpoints used as x coordinates.
    # It was previously evaluated at each bin's right edge, which shifted the
    # normal overlay by half a bin relative to the bars.
    bin_mids = np.array([interval.mid for interval in series_cuts.index])
    norm_dist = stats.norm.pdf(bin_mids, loc=series.mean(), scale=series.std())

    # Computed once and reused by the trace and the annotation below.
    cumulative = series.cumsum()
    # .iloc avoids the deprecated positional fallback of `[-1]` on a
    # non-integer index.
    last_cumulative = cumulative.iloc[-1]

    fig.add_trace(
        go.Scatter(
            x=series.index,
            y=series,
            line=dict(width=1, color=COLORS[0]),
            name="return",
        ),
        row=1,
        col=1,
    )
    fig.add_trace(
        go.Scatter(
            x=series.index,
            y=cumulative,
            line=dict(width=1, color=COLORS[0]),
            name="cum. return",
        ),
        row=2,
        col=1,
    )
    fig.add_trace(
        go.Bar(
            x=bin_mids,
            y=series_cuts / series_cuts.sum(),
            name="pct. of returns",
            marker=dict(color=COLORS[0]),
        ),
        row=1,
        col=2,
    )
    fig.add_trace(
        go.Scatter(
            x=bin_mids,
            y=norm_dist / norm_dist.sum(),
            name="normal",
            line=dict(width=1, color=COLORS[1]),
        ),
        row=1,
        col=2,
    )

    # Q/Q Data: standardized, sorted returns against normal quantiles
    returns_norm = ((series - series.mean()) / series.std()).sort_values()
    norm_dist = pd.Series(
        stats.norm.ppf(np.linspace(0.001, 0.999, len(series))),
        name="normal",
    )

    # add_trace replaces the deprecated append_trace.
    fig.add_trace(
        go.Scatter(
            x=norm_dist,
            y=returns_norm,
            name="return norm.",
            mode="markers",
            marker=dict(color=COLORS[0], size=3),
        ),
        row=2,
        col=2,
    )
    fig.add_trace(
        go.Scatter(
            x=norm_dist,
            y=norm_dist,
            name="norm.",
            line=dict(width=1, color=COLORS[1]),
        ),
        row=2,
        col=2,
    )

    # Label the final cumulative value next to the cumulative panel.
    fig.add_annotation(
        text=(f"{last_cumulative * 100:0.1f}%"),
        xref="paper",
        yref="y3",
        x=0.465,
        y=last_cumulative,
        xanchor="left",
        showarrow=False,
        align="left",
    )
    fig.add_annotation(
        get_moments_annotation(
            series.dropna(),
            xref="paper",
            yref="paper",
            x=0.55,
            y=0.45,
            xanchor="left",
            title="Returns",
            labels=IS_labels,
        ),
        font=dict(size=6, family="Courier New, monospace"),
    )

    fig.update_xaxes(showline=True, linewidth=1, linecolor="black", mirror=True)
    fig.update_yaxes(showline=True, linewidth=1, linecolor="black", mirror=True)

    fig.update_layout(
        title_text=(
            f"{title}<br>"
            f"{series.index.min().strftime('%Y-%m-%d %H:%M')}"
            f" - {series.index.max().strftime('%Y-%m-%d %H:%M')}"
        ),
        showlegend=False,
        height=600,
        font=dict(size=10),
        margin=dict(l=50, r=50, b=50, t=100),
        yaxis=dict(tickformat="0.3f"),
        yaxis3=dict(tickformat="0.3f"),
        yaxis2=dict(tickformat="0.3f"),
        yaxis4=dict(tickformat="0.1f"),
        xaxis2=dict(tickformat="0.3f"),
        xaxis4=dict(tickformat="0.1f"),
    )

    # NOTE(review): the loop sets every annotation font to 12, then
    # update_annotations sets them all to 10 - the loop looks redundant.
    # Kept unchanged; confirm the intended sizes.
    for i in fig["layout"]["annotations"]:
        i["font"]["size"] = 12

    fig.update_annotations(font=dict(size=10))

    return fig
from pandas import Series, DataFrame
import pandas as pd
import pandas_datareader.data as web
import numpy as np

# Small frame with NaNs for the descriptive-statistics calls below
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
print(df)
print(df.sum())
print(df.sum(axis=1))
print(df.idxmax())
print(df.cumsum())
print(df.describe())

# describe() on a Series of strings
obj = Series(['a', 'a', 'b', 'c'] * 4)
print(obj.describe())

# Download January 2010 data for each ticker (network calls)
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    print("get data:" + ticker)
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2010', '1/30/2010')

# One column per ticker
price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.items()})
volume = DataFrame({tic: data['Volume'] for tic, data in all_data.items()})

# Percent change of price, then pairwise and full-matrix statistics
returns = price.pct_change()
print(returns.tail())
print(returns.MSFT.corr(returns.IBM))
print(returns.MSFT.cov(returns.IBM))
print(returns.corr())
# Sum method dframe1.sum() # ignores null values (treats them as 0s) dframe1.sum(axis=1) # sum across rows # Min method dframe1.min() # finds the minimum value in each column dframe1.min(axis=1) # minimum value of each row dframe1.idxmin() # Find the index of minimum value column # Max method dframe1.max() dframe1.idxmax() # Cumulative sum dframe1.cumsum() # accumulates along each columns values # Describe method dframe1.describe() # summary statistics of dataframe (by columns) # correlation and covariance import pandas.io.data as pdweb # import pandas_datareader.data as pdweb import datetime prices = pdweb.get_data_yahoo(['CVX', 'XOM', 'BP'], start=datetime.datetime(2010, 1, 1), end=datetime.datetime(2013, 1, 1))['Adj Close'] prices.head() volume = pdweb.get_data_yahoo(['CVX', 'XOM', 'BP'],
from pandas import DataFrame df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=['a', 'b', 'c', 'd'], columns=['one', 'two']) print(df) print(df.sum()) #by columns print(df.sum(axis=1)) #by rows print(df.mean(axis=1, skipna=False)) print(df.idxmax()) #index value where the max value print(df.cumsum()) #accumulations print(df.describe()) obj = pd.Series(['a', 'a', 'b', 'c'] * 4) print(obj.describe()) #Correlation and Covariance import pandas.io.data as web all_data = {} for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']: all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010') price = DataFrame(
#CORRECT frame.apply(lambda x: x.max() - x.min() ) ##Map elementwise transformation over a Series ### ApplyMap Elmentwise transformation over a DF # Summary functions frame.mean(axis=1) # agg accross the colums frame.idxmax() frame.cumsum() frame.describe() pct = frame.pct_change() #format to 2dp pct.applymap(lambda x: '%.2f' % x) #newer python version pct.applymap(lambda x: '{0:.2%}'.format(x)) #value counts - calculate across all columns simultanesiously # fillna replaces nans with 0
df1.max()#maximum value along each column # In[78]: df1.idxmax() #maximum index # In[79]: #cummulative sum df1.cumsum() # In[80]: df1.describe() #statistical description of dataset # In[81]: df2 = DataFrame(randn(9).reshape(3,3),index=[1,2,3],columns=list('ABC')) df2
def add_values(
    log: pd.DataFrame, changes: pd.DataFrame, cashes: pd.DataFrame
) -> pd.DataFrame:
    """Fill the daily log with the effect of stock and cash transactions.

    ``log`` holds per-day *changes*; running positions are obtained by
    cumulatively summing its columns. It is modified in place and returned.

    Parameters
    ----------
    log : pd.DataFrame
        The dataframe that will have daily holdings. Its columns are
        addressed as ``("Quantity", ticker)``, ``("Cost Basis", ticker)``,
        ``("Profit", ticker)``, ``("Cash", "Cash")`` and ``("Cash", "User")``.
    changes : pd.DataFrame
        Transactions that changed holdings. Reads the columns ``Date``,
        ``Name``, ``Quantity``, ``Price``, ``Fees`` and ``Side``.
    cashes : pd.DataFrame
        Cash changing transactions. Reads the columns ``Date``, ``Price``,
        ``Quantity`` and ``Side`` (``"deposit"`` or ``"withdrawal"``).

    Returns
    ----------
    log : pd.DataFrame
        A dataframe with daily holdings

    Raises
    ------
    ValueError
        If a cash transaction's ``Side`` is neither ``"deposit"`` nor
        ``"withdrawal"``.
    """
    for index in log.index:
        # Add stocks to dataframe
        values = changes[changes["Date"] == index]
        for _, sub_row in values.iterrows():
            ticker = sub_row["Name"]
            quantity = sub_row["Quantity"]
            price = sub_row["Price"]
            fees = sub_row["Fees"]
            if math.isnan(fees):
                fees = 0
            side = sub_row["Side"].lower()
            sign = -1 if side == "sell" else 1
            signed_quantity = quantity * sign

            # Position held up to and including this day, before this
            # transaction is booked. Only the needed column is summed, once;
            # the old code ran cumsum over the whole frame for every cell read.
            held_quantity = log[("Quantity", ticker)].cumsum().at[index]
            pos1 = held_quantity > 0
            pos2 = signed_quantity > 0

            if side == "interest":
                log.at[index, ("Cost Basis", ticker)] = (
                    log.at[index, ("Cost Basis", ticker)] + quantity * price
                )
                log.at[index, ("Cash", "Cash")] = log.at[
                    index, ("Cash", "Cash")
                ] - (quantity * price)

            elif pos1 == pos2 or held_quantity == 0 or signed_quantity == 0:
                # Opening or adding to a position in the same direction.
                log.at[index, ("Quantity", ticker)] = (
                    log.at[index, ("Quantity", ticker)] + signed_quantity
                )
                log.at[index, ("Cost Basis", ticker)] = (
                    log.at[index, ("Cost Basis", ticker)]
                    + fees
                    + signed_quantity * price
                )
                log.at[index, ("Cash", "Cash")] = log.at[
                    index, ("Cash", "Cash")
                ] - (fees + signed_quantity * price)

            else:
                # Trading against the position: realise profit against the
                # weighted-average cost of the quantity traded.
                held_cost = log[("Cost Basis", ticker)].cumsum().at[index]
                rev = (
                    log.at[index, ("Profit", ticker)]
                    + signed_quantity * price * -1
                )
                wa_cost = (quantity / held_quantity) * held_cost
                log.at[index, ("Profit", ticker)] = rev - wa_cost - fees
                log.at[index, ("Cash", "Cash")] = (
                    log.at[index, ("Cash", "Cash")] + rev - fees
                )
                log.at[index, ("Quantity", ticker)] = (
                    log.at[index, ("Quantity", ticker)] + signed_quantity
                )
                log.at[index, ("Cost Basis", ticker)] = (
                    log.at[index, ("Cost Basis", ticker)] - wa_cost
                )

        cash_vals = cashes[cashes["Date"] == index]
        for _, sub_row in cash_vals.iterrows():
            amount = sub_row["Price"]
            quantity = sub_row["Quantity"]
            if sub_row["Side"] == "deposit":
                d = 1
            elif sub_row["Side"] == "withdrawal":
                d = -1
            else:
                raise ValueError("Cash type must be deposit or withdrawal")
            log.at[index, ("Cash", "Cash")] = (
                log.at[index, ("Cash", "Cash")] + d * amount * quantity
            )
            log.at[index, ("Cash", "User")] = (
                log.at[index, ("Cash", "User")] + d * amount * quantity
            )
    return log
def _cumulative_returns(returns: pd.DataFrame, is_log: bool):
    """Accumulate per-period returns into cumulative returns.

    When ``is_log`` is true the returns are summed cumulatively; otherwise
    they are compounded as ``cumprod(1 + r) - 1``.
    """
    if is_log:
        return returns.cumsum()
    growth = returns.add(1).cumprod()
    return growth.sub(1)
# Sort along axis=1, i.e. by column labels
df5.sort_index(axis=1)
df6 = DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]})
# Sort rows by the values in columns a, then b.
# sort_index(by=...) has been removed from pandas; sort_values is the replacement.
df6.sort_values(by=['a','b'])

# Basic statistics
df7 = DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],index=['a','b','c','d'],columns=['one','two'])
df7.sum()
df7.sum(axis=1)
# With skipna=False, a row containing NaN gets a NaN mean
df7.mean(axis=1,skipna=False)
# Index label of the maximum value
df7.idxmax()
# Cumulative sum
df7.cumsum()
# Several summary statistics at once
df7.describe()

# Unique values and value counts
obj = Series(['c','a','d','a','a','b','b','c','c'])
unique = obj.unique()
obj.value_counts()
pd.value_counts(obj.values,sort=True)
# Boolean mask of elements that are 'b' or 'c', used to filter the Series
mask = obj.isin(['b','c'])
obj[mask]

# Detecting missing data
data8 = Series(['a','b',np.nan,'d'])
data8.isnull()
data8[2] = None
pd.isnull(df) df.isnull() df.sum() df.sum(axis = 1) df.mean() df.mean(skipna = False) df.mean(axis = 1) df.mean(axis = 1, skipna = False) np.mean(df, axis = 1) df.idxmax() # 열기준 최고값 인덱스 : 과목별 고득점자 df.idxmin() # 열기준 최소값 인덱스 : 과목별 저득점자 df.cumsum() # row단위 누적합 df.cumsum(axis = 1) # col단위 누적합 df['영어'].sum() df['영어'].mean() df['영어'].var() df['영어'].std() df['영어'].max() df['영어'].min() df.loc['홍길동'].sum() df.loc['박찬호'].mean() df.describe() '''
from pandas import Series, DataFrame, date_range
from numpy.random import *
from datetime import datetime
import matplotlib.pyplot as plt

# Fixed seed so the random walks below are reproducible
seed(123456)

## basic 1: one random walk over 1000 days
ts = Series(randn(1000), index=date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()
plt.show()

## basic 2: four random walks sharing the same date index
df = DataFrame(randn(1000, 4), index=ts.index, columns=list('ABCD'))
df = df.cumsum()
df.plot(use_index=True)
plt.show()

## basic 3: plot one column against another
df3 = DataFrame(randn(1000, 2), columns=['B', 'C']).cumsum()
df3['A'] = Series(list(range(len(df3))))
df3.plot(x='A', y='B')
plt.show()

## barplot1: bar chart of the sixth row
# .ix has been removed from pandas. On this date-indexed frame, .ix[5]
# selected by position, so .iloc[5] is the equivalent.
df.iloc[5].plot(kind='bar')
plt.axhline(0, color='k')
plt.show()

## barplot2
Two 2.0 Three 4.0 dtype: float64 ''' # index of min value print(dframe1.idxmin()) ''' One A Two A Three B dtype: object ''' # acumulation print(dframe1.cumsum()) ''' One Two Three A 1.0 2.0 NaN B NaN 5.0 4.0 ''' # describe print(dframe1) ''' One Two Three A 1.0 2.0 NaN B NaN 3.0 4.0 ''' print(dframe1.describe()) '''
def plotter(df,
            title=False,
            kind='line',
            x_label=None,
            y_label=None,
            style='ggplot',
            figsize=(8, 4),
            save=False,
            legend_pos='best',
            reverse_legend='guess',
            num_to_plot=6,
            tex='try',
            colours='default',
            cumulative=False,
            pie_legend=True,
            partial_pie=False,
            show_totals=False,
            transparent=False,
            output_format='png',
            black_and_white=False,
            show_p_val=False,
            indices=False,
            transpose=False,
            rot=False,
            **kwargs):
    """Visualise corpus interrogations.

    :param title: A title for the plot
    :type title: str
    :param df: Data to be plotted
    :type df: Pandas DataFrame
    :param x_label: A label for the x axis
    :type x_label: str
    :param y_label: A label for the y axis
    :type y_label: str
    :param kind: The kind of chart to make
    :type kind: str ('line'/'bar'/'barh'/'pie'/'area')
    :param style: Visual theme of plot
    :type style: str ('ggplot'/'bmh'/'fivethirtyeight'/'seaborn-talk'/etc)
    :param figsize: Size of plot
    :type figsize: tuple (int, int)
    :param save: If bool, save with *title* as name; if str, use str as name
    :type save: bool/str
    :param legend_pos: Where to place legend
    :type legend_pos: str ('upper right'/'outside right'/etc)
    :param reverse_legend: Reverse the order of the legend
    :type reverse_legend: bool
    :param num_to_plot: How many columns to plot
    :type num_to_plot: int/'all'
    :param tex: Use TeX to draw plot text
    :type tex: bool
    :param colours: Colourmap for lines/bars/slices
    :type colours: str
    :param cumulative: Plot values cumulatively
    :type cumulative: bool
    :param pie_legend: Show a legend for pie chart
    :type pie_legend: bool
    :param partial_pie: Allow plotting of pie slices only
    :type partial_pie: bool
    :param show_totals: Print sums in plot where possible
    :type show_totals: str -- 'legend'/'plot'/'both'
    :param transparent: Transparent .png background
    :type transparent: bool
    :param output_format: File format for saved image
    :type output_format: str -- 'png'/'pdf'
    :param black_and_white: Create black and white line styles
    :type black_and_white: bool
    :param show_p_val: Attempt to print p values in legend if contained in df
    :type show_p_val: bool
    :param indices: To use when plotting "distance from root"
    :type indices: bool
    :param stacked: When making bar chart, stack bars on top of one another
    :type stacked: str
    :param filled: For area and bar charts, make every column sum to 100
    :type filled: str
    :param legend: Show a legend
    :type legend: bool
    :param rot: Rotate x axis ticks by *rot* degrees
    :type rot: int
    :param subplots: Plot each column separately
    :type subplots: bool
    :param layout: Grid shape to use when *subplots* is True
    :type layout: tuple -- (int, int)
    :returns: matplotlib figure

    Further keyword arguments read here: ``xtickspan``, ``grid``,
    ``draggable``, ``legend_frame``, ``legend_alpha``, ``savepath``,
    ``shape``, ``truncate`` and ``name_format`` (the last three only for
    MultiIndexed data). Anything left over is forwarded to
    ``DataFrame.plot`` (or ``seaborn.heatmap`` for ``kind='heatmap'``).

    Note that a non-subplot heatmap returns the seaborn axes; a
    MultiIndexed frame and every other path return the ``pyplot`` module.
    """
    kwargs['rot'] = rot
    xtickspan = kwargs.pop('xtickspan', False)

    # if the data was multiindexed, the default is a little different!
    # each first-level group gets its own axes in a grid of subplots
    if isinstance(df.index, MultiIndex):
        import matplotlib.pyplot as nplt
        shape = kwargs.get('shape', 'auto')
        truncate = kwargs.get('truncate', 8)
        if shape == 'auto':
            shape = (int(len(df.index.levels[0]) / 2), 2)
        f, axes = nplt.subplots(*shape)
        for i, ((name, data), ax) in enumerate(zip(df.groupby(level=0), axes.flatten())):
            data = data.loc[name]
            if isinstance(truncate, int) and i > truncate:
                continue
            if kwargs.get('name_format'):
                name = kwargs.get('name_format').format(name)
            data.chart(
                title=name,
                ax=ax,
                kind=kind,
                x_label=x_label,
                y_label=y_label,
                style=style,
                figsize=figsize,
                save=save,
                legend_pos=legend_pos,
                reverse_legend=reverse_legend,
                num_to_plot=num_to_plot,
                tex=tex,
                colours=colours,
                cumulative=cumulative,
                pie_legend=pie_legend,
                partial_pie=partial_pie,
                show_totals=show_totals,
                transparent=transparent,
                output_format=output_format,
                black_and_white=black_and_white,
                show_p_val=show_p_val,
                indices=indices,
                transpose=transpose,
                rot=rot)
        return nplt

    title = title or ""

    # get a few options from kwargs
    sbplt = kwargs.get('subplots', False)
    show_grid = kwargs.pop('grid', True)
    the_rotation = kwargs.get('rot', False)
    dragmode = kwargs.pop('draggable', False)
    leg_frame = kwargs.pop('legend_frame', True)
    leg_alpha = kwargs.pop('legend_alpha', 0.8)
    # auto set num to plot based on layout
    lo = kwargs.get('layout', None)
    if lo:
        num_to_plot = lo[0] * lo[1]

    # todo: get this dynamically instead.
    styles = ['dark_background', 'bmh', 'grayscale', 'ggplot',
              'fivethirtyeight', 'matplotlib', False, 'mpl-white']

    if style == 'mpl-white':
        try:
            sns.set_style("whitegrid")
        except Exception:
            # best effort: seaborn may be missing
            pass
        style = 'matplotlib'

    if kwargs.get('savepath'):
        mpl.rcParams['savefig.directory'] = kwargs.get('savepath')
        kwargs.pop('savepath', None)

    mpl.rcParams['savefig.bbox'] = 'tight'
    mpl.rcParams.update({'figure.autolayout': True})

    # try to use tex
    # make some font kwargs here
    using_tex = False
    mpl.rcParams['font.family'] = 'sans-serif'
    if tex == 'try' or tex is True:
        try:
            rc('text', usetex=True)
            rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
            using_tex = True
        except Exception:
            matplotlib.rc('font', family='sans-serif')
            matplotlib.rc('font', serif='Helvetica Neue')
            matplotlib.rc('text', usetex='false')
            rc('text', usetex=False)
    else:
        rc('text', usetex=False)

    # normalise to a string so the .endswith() checks below are safe
    if show_totals is False:
        show_totals = 'none'

    # find out what kind of plot we're making
    kwargs['kind'] = kind.lower()

    # find out if pie mode, add autopct format
    piemode = kind == "pie"
    if piemode:
        # always the best spot for pie
        #if legend_pos == 'best':
            #legend_pos = 'lower left'
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            kwargs['pctdistance'] = 0.6
            if using_tex:
                kwargs['autopct'] = r'%1.1f\%%'
            else:
                kwargs['autopct'] = '%1.1f%%'

    # copy data, make series into df
    dataframe = df.copy()
    if kind == 'heatmap':
        try:
            dataframe = dataframe.T
        except Exception:
            pass
    was_series = isinstance(dataframe, Series)
    if was_series:
        was_series = True
        if not cumulative:
            dataframe = DataFrame(dataframe)
        else:
            dataframe = DataFrame(dataframe.cumsum())
    else:
        # don't know if this is much good.
        if transpose:
            dataframe = dataframe.T
        if cumulative:
            dataframe = DataFrame(dataframe.cumsum())
        # a one-column frame is handled like a series from here on
        if len(list(dataframe.columns)) == 1:
            was_series = True

    # look at columns to see if all can be ints, in which case, set up figure
    # for depnumming
    if not was_series:
        if indices == 'guess':
            indices = all([isint(x) for x in list(dataframe.columns)])

        # if depnumming, plot all, transpose, and rename axes
        if indices is True:
            num_to_plot = 'all'
            dataframe = dataframe.T
            if y_label is None:
                y_label = 'Percentage of all matches'
            if x_label is None:
                x_label = ''

    # set backend?
    output_formats = ['svgz', 'ps', 'emf', 'rgba', 'raw', 'pdf', 'svg',
                      'eps', 'png', 'pgf']
    if output_format not in output_formats:
        raise ValueError('%s output format not recognised. Must be: %s' % (output_format, ', '.join(output_formats)))

    # don't know if these are necessary
    if 'pdf' in output_format:
        plt.switch_backend(output_format)
    if 'pgf' in output_format:
        plt.switch_backend(output_format)

    if num_to_plot == 'all':
        if was_series:
            if not piemode:
                num_to_plot = len(dataframe)
            else:
                num_to_plot = len(dataframe)
        else:
            if not piemode:
                num_to_plot = len(list(dataframe.columns))
            else:
                num_to_plot = len(dataframe.index)

    # explode pie, or remove if not piemode
    if piemode and not sbplt and kwargs.get('explode'):
        kwargs['explode'] = auto_explode(dataframe,
                                         kwargs['explode'],
                                         was_series=was_series,
                                         num_to_plot=num_to_plot)
    else:
        kwargs.pop('explode', None)

    legend = kwargs.get('legend', True)

    if not was_series:
        if transpose:
            dataframe = dataframe.head(num_to_plot)
        else:
            dataframe = dataframe.T.head(num_to_plot).T

    # remove stats fields, put p in entry text, etc.
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        dataframe = dataframe.drop(statfields, axis=1, errors='ignore')
    except Exception:
        pass
    # `.ix` no longer exists in pandas, so the old lookup always failed and
    # p values were silently ignored; `.loc` does the label lookup.
    try:
        dataframe.loc['p']
        there_are_p_vals = True
    except Exception:
        there_are_p_vals = False
    if show_p_val and there_are_p_vals:
        newnames = []
        for col in list(dataframe.columns):
            pval = dataframe[col]['p']
            pstr = p_string_formatter(pval, using_tex)
            newname = '%s (%s)' % (col, pstr)
            newnames.append(newname)
        dataframe.columns = newnames
        dataframe.drop(statfields, axis=0, inplace=True, errors='ignore')
    elif there_are_p_vals:
        dataframe.drop(statfields, axis=0, inplace=True, errors='ignore')

    # make and set y label
    # data counts as absolute frequencies when the first row is whole numbers
    absolutes = True
    if isinstance(dataframe, DataFrame):
        try:
            if not all([s.is_integer() for s in dataframe.iloc[0,:].values]):
                absolutes = False
        except Exception:
            pass
    else:
        if not all([s.is_integer() for s in dataframe.values]):
            absolutes = False

    ##########################################
    ################ COLOURS #################
    ##########################################

    # set defaults, with nothing for heatmap yet
    if colours is True or colours == 'default' or colours == 'Default':
        if kind != 'heatmap':
            colours = 'viridis'
        else:
            colours = 'default'

    # assume it's a single color, unless string denoting map
    cmap_or_c = 'color'
    if isinstance(colours, str):
        cmap_or_c = 'colormap'
    from matplotlib.colors import LinearSegmentedColormap
    if isinstance(colours, LinearSegmentedColormap):
        cmap_or_c = 'colormap'

    # for heatmaps, it's always a colormap
    if kind == 'heatmap':
        cmap_or_c = 'cmap'
        # if it's a defaulty string, set accordingly
        if isinstance(colours, str):
            if colours.lower().startswith('diverg'):
                colours = sns.diverging_palette(10, 133, as_cmap=True)

            # if default not set, do diverge for any df with a number < 0
            elif colours.lower() == 'default':
                mn = dataframe.min()
                if isinstance(mn, Series):
                    mn = mn.min()
                if mn < 0:
                    colours = sns.diverging_palette(10, 133, as_cmap=True)
                else:
                    colours = sns.light_palette("green", as_cmap=True)

    if 'seaborn' not in style:
        kwargs[cmap_or_c] = colours

    # reversing legend option: explicit True/False wins; 'guess' reverses
    # only for the kinds that stack upward (barh, area)
    rev_leg = reverse_legend is True

    # show legend or don't, guess whether to reverse based on kind
    if kind in ['bar', 'barh', 'area', 'line', 'pie']:
        if was_series:
            legend = False
        if kind == 'pie':
            if pie_legend:
                legend = True
            else:
                legend = False
    if kind in ['barh', 'area']:
        if reverse_legend == 'guess':
            rev_leg = True

    # the default legend placement
    if legend_pos is True:
        legend_pos = 'best'

    # no title for subplots because ugly,
    if title and not sbplt:
        kwargs['title'] = title

    # not using pandas for labels or legend anymore.
    #kwargs['labels'] = None
    #kwargs['legend'] = False

    if legend:
        if num_to_plot > 6:
            if not kwargs.get('ncol'):
                kwargs['ncol'] = num_to_plot // 7
        # kwarg options go in leg_options
        leg_options = {'framealpha': leg_alpha,
                       'shadow': kwargs.get('shadow', False),
                       'ncol': kwargs.pop('ncol', 1)}

        # determine legend position based on this dict
        if legend_pos:
            possible = {'best': 0, 'upper right': 1, 'upper left': 2, 'lower left': 3, 'lower right': 4,
                        'right': 5, 'center left': 6, 'center right': 7, 'lower center': 8, 'upper center': 9,
                        'center': 10, 'o r': 2, 'outside right': 2, 'outside upper right': 2,
                        'outside center right': 'center left', 'outside lower right': 'lower left'}

            if isinstance(legend_pos, int):
                the_loc = legend_pos
            elif isinstance(legend_pos, str):
                try:
                    the_loc = possible[legend_pos]
                except KeyError:
                    raise KeyError('legend_pos value must be one of:\n%s\n or an int between 0-10.' %', '.join(list(possible.keys())))
            leg_options['loc'] = the_loc
            #weirdness needed for outside plot
            if legend_pos in ['o r', 'outside right', 'outside upper right']:
                leg_options['bbox_to_anchor'] = (1.02, 1)
            if legend_pos == 'outside center right':
                leg_options['bbox_to_anchor'] = (1.02, 0.5)
            if legend_pos == 'outside lower right':
                # The original had a no-op comparison here
                # (`leg_options['loc'] == 'upper right'`) and anchored the
                # legend at the centre of the axes. Follow the pattern of
                # the two cases above instead: keep loc 'lower left' and
                # anchor it just right of the axes, at the bottom.
                leg_options['bbox_to_anchor'] = (1.02, 0)

        # a bit of distance between legend and plot for outside legends
        if isinstance(legend_pos, str):
            if legend_pos.startswith('o'):
                leg_options['borderaxespad'] = 1

    if not piemode:
        if show_totals.endswith('both') or show_totals.endswith('legend'):
            dataframe = rename_data_with_total(dataframe,
                                               was_series=was_series,
                                               using_tex=using_tex,
                                               absolutes=absolutes)
    else:
        if pie_legend:
            if show_totals.endswith('both') or show_totals.endswith('legend'):
                dataframe = rename_data_with_total(dataframe,
                                                   was_series=was_series,
                                                   using_tex=using_tex,
                                                   absolutes=absolutes)

    if piemode and partial_pie:
        dataframe = dataframe / 100.0

    # some pie things
    if piemode and not sbplt:
        kwargs['y'] = list(dataframe.columns)[0]

    areamode = False
    if kind == 'area':
        areamode = True

    if legend is False:
        kwargs['legend'] = False

    if kwargs.get('filled'):
        if areamode or kind.startswith('bar'):
            dataframe = filler(dataframe)
        kwargs.pop('filled', None)

    # per-line marker/dash styles cycled through in black and white mode
    MARKERSIZE = 4
    COLORMAP = {
        0: {'marker': None, 'dash': (None,None)},
        1: {'marker': None, 'dash': [5,5]},
        2: {'marker': "o", 'dash': (None,None)},
        3: {'marker': None, 'dash': [1,3]},
        4: {'marker': "s", 'dash': [5,2,5,2,5,10]},
        5: {'marker': None, 'dash': [5,3,1,2,1,10]},
        6: {'marker': 'o', 'dash': (None,None)},
        7: {'marker': None, 'dash': [5,3,1,3]},
        8: {'marker': "1", 'dash': [1,3]},
        9: {'marker': "*", 'dash': [5,5]},
        10: {'marker': "2", 'dash': [5,2,5,2,5,10]},
        11: {'marker': "s", 'dash': (None,None)}
        }

    HATCHES = {
        0: {'color': '#dfdfdf', 'hatch':"/"},
        1: {'color': '#6f6f6f', 'hatch':"\\"},
        2: {'color': 'b', 'hatch':"|"},
        3: {'color': '#dfdfdf', 'hatch':"-"},
        4: {'color': '#6f6f6f', 'hatch':"+"},
        5: {'color': 'b', 'hatch':"x"}
        }

    if black_and_white:
        if kind == 'line':
            kwargs['linewidth'] = 1
        cmap = plt.get_cmap('Greys')
        new_cmap = truncate_colormap(cmap, 0.25, 0.95)
        if kind == 'bar':
            # darker if just one entry
            if len(dataframe.columns) == 1:
                new_cmap = truncate_colormap(cmap, 0.70, 0.90)
        kwargs[cmap_or_c] = new_cmap

    # remove things from kwargs if heatmap
    if kind == 'heatmap':
        number_format = ".2f"
        if all(dataframe[i].astype(str).str.isdigit().all() for i in list(dataframe.columns)):
            number_format = None
        hmargs = {'annot': kwargs.pop('annot', True),
                  cmap_or_c: kwargs.pop(cmap_or_c, None),
                  'cbar': kwargs.pop('cbar', False)}
        if number_format:
            hmargs['fmt'] = number_format
        for i in ['vmin', 'vmax', 'linewidths', 'linecolor',
                  'robust', 'center', 'cbar_kws', 'cbar_ax',
                  'square', 'mask', 'norm']:
            if i in kwargs.keys():
                hmargs[i] = kwargs.pop(i, None)

    class dummy_context_mgr():
        """a fake context for plotting without style
        perhaps made obsolete by 'classic' style in new mpl"""
        def __enter__(self):
            return None
        def __exit__(self, one, two, three):
            return False

    # Legend handle; stays None when no legend is drawn so the save step
    # below can tell whether there is an artist to include.
    lgd = None

    with plt.style.context((style)) if style != 'matplotlib' else dummy_context_mgr():

        kwargs.pop('filled', None)

        if not sbplt:
            # check if negative values, no stacked if so
            if areamode:
                if not kwargs.get('ax'):
                    kwargs['legend'] = False
                if (dataframe < 0.0).any().any():
                    kwargs['stacked'] = False
                    rev_leg = False
            if kind != 'heatmap':
                # turn off pie labels at the last minute
                if kind == 'pie' and pie_legend:
                    kwargs['labels'] = None
                    kwargs['autopct'] = '%.2f'
                if kind == 'pie':
                    kwargs.pop('color', None)
                ax = dataframe.plot(figsize=figsize, **kwargs)
            else:
                fg = plt.figure(figsize=figsize)
                if title:
                    plt.title(title)
                ax = kwargs.get('ax', plt.axes())
                tmp = sns.heatmap(dataframe, ax=ax, **hmargs)
                ax.set_title(title)
                for item in tmp.get_yticklabels():
                    item.set_rotation(0)
                return tmp

            # not good, but otherwise it doesn't show up!
            if areamode and not kwargs.get('ax'):
                handles, labels = plt.gca().get_legend_handles_labels()
                del handles
                del labels

            if x_label:
                ax.set_xlabel(x_label)
            if y_label:
                ax.set_ylabel(y_label)

        else:
            if not kwargs.get('layout'):
                plt.gcf().set_tight_layout(False)

            if kind != 'heatmap':
                ax = dataframe.plot(figsize=figsize, **kwargs)
            else:
                plt.figure(figsize=figsize)
                if title:
                    plt.title(title)
                ax = plt.axes()
                sns.heatmap(dataframe, ax=ax, **hmargs)
                plt.xticks(rotation=0)
                plt.yticks(rotation=0)

        if sbplt:
            # flatten the (possibly 2-D) collection of axes
            if 'layout' not in kwargs:
                axes = [l for l in ax]
            else:
                axes = []
                cols = [l for l in ax]
                for col in cols:
                    for bit in col:
                        axes.append(bit)
            for index, a in enumerate(axes):
                if xtickspan is not False:
                    a.xaxis.set_major_locator(ticker.MultipleLocator(xtickspan))
                labels = [item.get_text() for item in a.get_xticklabels()]
                rotation = rotate_degrees(the_rotation, labels)
                try:
                    if the_rotation == 0:
                        ax.set_xticklabels(labels, rotation=rotation, ha='center')
                    else:
                        ax.set_xticklabels(labels, rotation=rotation, ha='right')
                except AttributeError:
                    pass
        else:
            if kind == 'heatmap':
                labels = [item.get_text() for item in ax.get_xticklabels()]
                rotation = rotate_degrees(the_rotation, labels)
                if the_rotation == 0:
                    ax.set_xticklabels(labels, rotation=rotation, ha='center')
                else:
                    ax.set_xticklabels(labels, rotation=rotation, ha='right')

        if transparent:
            plt.gcf().patch.set_facecolor('white')
            plt.gcf().patch.set_alpha(0)

        if black_and_white and kind == 'line':
            # white background
            # change everything to black and white with interesting dashes and markers
            c = 0
            for line in ax.get_lines():
                line.set_color('black')
                #line.set_width(1)
                line.set_dashes(COLORMAP[c]['dash'])
                line.set_marker(COLORMAP[c]['marker'])
                line.set_markersize(MARKERSIZE)
                c += 1
                if c == len(list(COLORMAP.keys())):
                    c = 0

        # draw legend with proper placement etc
        if legend and not piemode and not sbplt and kind != 'heatmap':
            handles, labels = plt.gca().get_legend_handles_labels()
            # area doubles the handles and labels. this removes half:
            #if areamode:
            #    handles = handles[-len(handles) / 2:]
            #    labels = labels[-len(labels) / 2:]
            if rev_leg:
                handles = handles[::-1]
                labels = labels[::-1]
            if kwargs.get('ax'):
                lgd = plt.gca().legend(handles, labels, **leg_options)
                ax.get_legend().draw_frame(leg_frame)
            else:
                lgd = plt.legend(handles, labels, **leg_options)
                lgd.draw_frame(leg_frame)

    if piemode:
        if not sbplt:
            plt.axis('equal')
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)

    # add x label
    # this could be revised now!
    # if time series period, it's year for now
    # if isinstance(dataframe.index, pandas.tseries.period.PeriodIndex):
    #     x_label = 'Year'

    y_l = False
    if not absolutes:
        y_l = 'Percentage'
    else:
        y_l = 'Absolute frequency'

    # hacky: turn legend into subplot titles :)
    if sbplt:
        # title the big plot
        #plt.gca().suptitle(title, fontsize = 16)
        #plt.subplots_adjust(top=0.9)
        # get all axes
        if 'layout' not in kwargs:
            axes = [l for index, l in enumerate(ax)]
        else:
            axes = []
            cols = [l for index, l in enumerate(ax)]
            for col in cols:
                for bit in col:
                    axes.append(bit)

        # set subplot titles
        # start from an empty title so a failed first lookup cannot leave
        # `titletext` unbound; later failures reuse the previous title
        titletext = ''
        for index, a in enumerate(axes):
            try:
                titletext = list(dataframe.columns)[index]
            except Exception:
                pass
            a.set_title(titletext)
            try:
                a.legend_.remove()
            except Exception:
                pass
            #try:
            #    from matplotlib.ticker import MaxNLocator
            #    from corpkit.process import is_number
            #    indx = list(dataframe.index)
            #    if all([is_number(qq) for qq in indx]):
            #        ax.get_xaxis().set_major_locator(MaxNLocator(integer=True))
            #except:
            #    pass
            # remove axis labels for pie plots
            if piemode:
                a.axes.get_xaxis().set_visible(False)
                a.axes.get_yaxis().set_visible(False)
                a.axis('equal')

            # passed positionally: the keyword was renamed from `b` to
            # `visible` across matplotlib versions
            a.grid(show_grid)

    # add sums to bar graphs and pie graphs
    # doubled right now, no matter

    if not sbplt:
        # show grid
        ax.grid(show_grid)

        if kind.startswith('bar'):
            width = ax.containers[0][0].get_width()

        if was_series:
            the_y_limit = plt.ylim()[1]
            if show_totals.endswith('plot') or show_totals.endswith('both'):
                # make plot a bit higher if putting these totals on it
                plt.ylim([0,the_y_limit * 1.05])
                for i, label in enumerate(list(dataframe.index)):
                    if len(dataframe.loc[label]) == 1:
                        score = dataframe.loc[label].iloc[0]
                    else:
                        if absolutes:
                            score = dataframe.loc[label].sum()
                        else:
                            #import warnings
                            #warnings.warn("It's not possible to determine total percentage from individual percentages.")
                            continue
                    if not absolutes:
                        plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom')
                    else:
                        plt.annotate(score, (i, score), ha = 'center', va = 'bottom')
        else:
            the_y_limit = plt.ylim()[1]
            if show_totals.endswith('plot') or show_totals.endswith('both'):
                for i, label in enumerate(list(dataframe.columns)):
                    if len(dataframe[label]) == 1:
                        score = dataframe[label].iloc[0]
                    else:
                        if absolutes:
                            score = dataframe[label].sum()
                        else:
                            #import warnings
                            #warnings.warn("It's not possible to determine total percentage from individual percentages.")
                            continue
                    if not absolutes:
                        plt.annotate('%.2f' % score, (i, score), ha='center', va='bottom')
                    else:
                        plt.annotate(score, (i, score), ha='center', va='bottom')

    if not kwargs.get('layout') and not sbplt and not kwargs.get('ax'):
        plt.tight_layout()

    if kwargs.get('ax'):
        try:
            plt.gcf().set_tight_layout(False)
        except Exception:
            pass

    if save:
        imagefolder = 'images'

        savename = get_savename(imagefolder, save=save, title=title, ext=output_format)

        if not os.path.isdir(imagefolder):
            os.makedirs(imagefolder)

        # save image and get on with our lives
        # only hand the legend to savefig when one was actually drawn, and
        # only test the prefix when legend_pos is a string (it may be an int)
        outside_legend = (isinstance(legend_pos, str)
                          and legend_pos.startswith('o')
                          and not sbplt
                          and lgd is not None)
        if outside_legend:
            plt.gcf().savefig(savename, dpi=150, bbox_extra_artists=(lgd,),
                              bbox_inches='tight', format=output_format)
        else:
            plt.gcf().savefig(savename, dpi=150, format=output_format)
        time = strftime("%H:%M:%S", localtime())
        if os.path.isfile(savename):
            print('\n' + time + ": " + savename + " created.")
        else:
            raise ValueError("Error making %s." % savename)

    if dragmode:
        drag_leg = plt.legend()
        # Legend.draggable() was replaced by set_draggable() in newer
        # matplotlib; support both
        if hasattr(drag_leg, 'set_draggable'):
            drag_leg.set_draggable(True)
        else:
            drag_leg.draggable()

    if sbplt:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)

    return plt
def main():
    """
    Calculation and aggregation of summary statistics

    Prints a tour of pandas summary statistics, correlation/covariance
    (skipped while the ticker list is empty) and unique/value-count
    helpers. Returns nothing.
    """
    # Summary of statistics
    # return is not ndarray
    df = DataFrame([[1.4, np.nan],
                    [7.1, -4.5],
                    [np.nan, np.nan],
                    [0.75, -1.3]],
                   index=list('abcd'),
                   columns=['one', 'two'])
    print(df)
    print(df.sum())
    print(df.sum(axis=1))
    print(df.mean(axis=1))  # exclude nan
    print(df.mean(axis=1, skipna=False))
    print(df.idxmin())
    print(df.idxmax())
    print(df.cumsum())
    print(df.describe())

    # values are not number
    obj = Series(list('aabc') * 4)
    print(obj.describe())

    methods = ['count', 'min', 'max',  # 'argmin', 'argmax',
               'quantile', 'median', 'mad', 'var', 'std',
               'skew', 'kurt', 'cummin', 'cummax', 'cumprod',
               'diff', 'pct_change']
    for method in methods:
        print('「{0}」'.format(method))
        stat = getattr(df, method, None)
        if stat is None:
            # Some reductions (e.g. DataFrame.mad) were removed from newer
            # pandas releases; skip them instead of aborting the tour.
            print('(not available in this pandas version)')
        else:
            print(stat())
        print('')

    # Correspond and Covariance
    all_data = {}
    lst = []  # ['AAPL', 'IBM', 'MSFT'] #, 'GOOG']:
    for ticket in lst:  #, 'GOOG']:
        # IOError: after 3 tries, Yahoo! did not return a 200
        # for url 'http://ichart.finance.yahoo.com/table.csv?s=GOOG&a=0&b=1&c=2000&d=0&e=1&f=2010&g=d&ignore=.csv'
        # NOTE(review): pandas.io.data no longer ships with pandas; this
        # call needs replacing before tickers are added back to `lst`.
        all_data[ticket] = pd.io.data.get_data_yahoo(ticket, '1/1/2000', '1/1/2010')
    price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.items()})
    volume = DataFrame({tic: data['Volume'] for tic, data in all_data.items()})
    if all_data:
        returns = price.pct_change()
        print(returns.tail())
        print('')
        print(returns.MSFT.corr(returns.IBM))
        print(returns.MSFT.cov(returns.IBM))
        print('')
        print(returns.corr())
        print(returns.cov())
        print('')
        print(returns.corrwith(returns.IBM))
        print(returns.corrwith(volume))

    # unique, frequency, belong
    print('', '')
    obj = Series(list('cadaabbcc'))
    uniques = obj.unique()
    print(uniques)
    print(obj.value_counts())
    # the top-level pd.value_counts is deprecated; use the Series method
    print(Series(obj.values).value_counts(sort=False))
    mask = obj.isin(['b', 'c'])
    print(mask)
    print(obj[mask])
    data = DataFrame({
        'Qu1': [1, 3, 4, 3, 4],
        'Qu2': [2, 3, 1, 2, 3],
        'Qu3': [1, 5, 2, 4, 4],
    })
    print(data)
    print(data.apply(lambda col: col.value_counts()).fillna(0))
def cumsum(df: pd.DataFrame) -> pd.DataFrame:
    """Return the cumulative sum of *df* using ``DataFrame.cumsum`` defaults.

    The input frame is not modified; a new frame is returned.
    """
    running_totals = df.cumsum()
    return running_totals
# sorting and ranking
# (`frame` is defined earlier in the script, outside this fragment)
frame_sort = frame.sort_index()
print(frame_sort)
frame_sort = frame.sort_values(by='b')
print(frame_sort)

# Summarizing and computing descriptive statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
print(df.sum(axis=0))
print(df.mean(axis=1))
print(df.cumsum(axis=0))

# correlation and covariance
df = DataFrame(np.random.randn(100, 4), columns=['AAPL', 'GOOG', 'IBM', 'MSFT'])
df_corr = df.corr()
df_cov = df.cov()
print(df_corr)
print(df_cov)

# Unique values, value counts and membership
data = DataFrame(np.random.randint(0, 5, (5, 3)), columns=list('abc'))
print(data)
# The top-level pd.value_counts is deprecated; apply the Series method to
# each column instead. Values absent from a column come back as NaN, hence
# the fillna(0).
result = data.apply(pd.Series.value_counts).fillna(0)
print(result)
class Portfolio(object):
    """This class represents portfolio and its events.

    State is held in DataFrames that share the index and columns of
    ``prices`` (one column per symbol):

    * ``prices`` / ``volumes`` -- built from the ohlc objects
    * ``trades`` -- signed quantity traded at each timestamp
    * ``fees`` -- fee charged for each trade
    * ``quantities`` -- position held (cumulative trades)
    * ``values`` -- position value (quantities * prices)
    * ``capital`` -- free capital at each timestamp

    Everything except ``prices``, ``volumes`` and ``trades`` is derived and
    recomputed by :meth:`refresh`.
    """

    def __repr__(self):
        return '<Portfolio {}>'.format(self.prices.shape)

    def __init__(self, ohlcs, starting_capital=100000, price_type='cprices',
                 transaction_fee_bps=15., transaction_fee_min=7):
        """Build an empty (no trades) portfolio over the given ohlc series.

        ``transaction_fee_bps`` is the fee in basis points of traded value;
        ``transaction_fee_min`` is the minimum fee applied to any non-zero
        trade.
        """
        self.price_type = price_type
        self.transaction_fee_bps = transaction_fee_bps
        self.transaction_fee_min = transaction_fee_min
        self.prices = self.from_ohlcs(ohlcs, price_type)
        self.volumes = self.from_ohlcs(ohlcs, 'volumes')
        self.trades = DataFrame(zeros(self.prices.shape), self.prices.index,
                                self.prices.columns)
        self.fees = DataFrame(zeros(self.prices.shape), self.prices.index,
                              self.prices.columns)
        self.starting_capital = starting_capital
        self.capital = []
        self.quantities = []
        self.values = []
        self.refresh()

    def from_ohlcs(self, ohlcs, price_type):
        """Set prices using a list of ohlc classes.

        Reads attribute ``price_type`` from every ohlc, outer-joins the
        resulting columns on their timestamps and forward-fills the gaps.
        Columns are named after ``ohlc.symbol``.
        """
        dfs = []
        for ohlc in ohlcs:
            df = DataFrame(getattr(ohlc, price_type),
                           posix_as_dt(ohlc.timestamps))
            dfs.append(df)
        prices = concat(dfs, join='outer', axis=1)
        prices.columns = [ohlc.symbol for ohlc in ohlcs]
        # fillna(method='pad') is deprecated; ffill() is the same operation.
        return prices.ffill()

    def refresh(self):
        """Calculates positions, values, free capital and costs from trades.

        Fees of short positions (if any) are same as cost for long. This is
        not realistic, but the class is intended to represent long only
        portfolios.
        """
        self.fees = self.transaction_fee_bps * self.trades.abs() * \
            self.prices / 10000
        small = self.fees < self.transaction_fee_min
        nonzero = self.trades.abs() > 0
        # Every actual trade pays at least the minimum fee; cells without a
        # trade keep a zero fee.
        self.fees[small & nonzero] = self.transaction_fee_min
        self.quantities = self.trades.cumsum()
        self.values = self.quantities * self.prices
        self.capital = self.starting_capital + self.total_trade_values - \
            self.total_fees

    def trade(self, timestring, symbol, quantity):
        """Convenience function to enter trades and refresh.

        Writes with a single ``.loc`` call: the chained form
        ``trades[symbol][timestring] = ...`` assigns to a temporary and does
        not update ``trades`` when pandas copy-on-write is active.
        """
        self.trades.loc[timestring, symbol] = quantity
        self.refresh()

    def trade_max(self):
        """Trade all capital on first day, equal sized positions.

        The capital is split evenly across the symbols and each share is
        converted to a whole number of units (truncated) at the first
        day's price.
        """
        first_day = dt_as_str(self.prices.index[0])
        symbols = self.prices.columns
        per_symbol = self.starting_capital / float(len(symbols))
        trade_sizes = trunc(per_symbol / self.prices.iloc[0].values)
        for symbol, size in zip(symbols, trade_sizes):
            self.trade(first_day, symbol, size)
        self.refresh()

    @property
    def market_value(self):
        """Value of equity positions at each time."""
        return self.values.sum(axis=1)

    @property
    def total_value(self):
        """Total value of portfolio at each time."""
        return self.market_value + self.capital

    @property
    def trade_values(self):
        """Trade values for each trade.

        Cash flow per trade (buying is negative); cells that are zero or
        NaN are normalised to 0.
        """
        tvals = -self.trades * self.prices
        to_0 = (tvals == 0) | isnull(tvals)
        tvals[to_0] = 0
        return tvals

    @property
    def total_trade_values(self):
        """Cumulative sum of all trades."""
        return self.trade_values.sum(axis=1).cumsum()

    @property
    def total_fees(self):
        """Cumulative sum of fees."""
        return self.fees.sum(axis=1).cumsum()
columns=['one', 'two']) df df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=['a', 'b', 'c', 'd'], columns=['one', 'two']) df df.sum() # columns sum df.sum(axis=1) # sum row by row df (7.10 - 4.5)/2 df.mean(axis=1, skipna=False) df df.idxmax() df df.cumsum() # accumultation df.describe() # multiple summary statistics in one shot. obj = Series(['a', 'a', 'b', 'c'] * 4) obj obj.describe() ## Correlation and Covariance import pandas.io.data as web all_data = {} for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']: all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010') price = DataFrame({tic: data['Adj Close'], for tic, data in all_data.iteritems()}) price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.iteritems()}) price
# Column totals and column means of `team_df`
# (`team_df` is defined in an earlier cell, outside this fragment).
print(team_df.sum())
print("----")
print(team_df.mean())
print("----")


# ### I want to see summary values for each team.

# In[33]:


team_df.describe()


# In[34]:


## Cumulative statistics by date
team_df.cumsum()


# ### Totals by date

# In[35]:


## Totals by date
print(team_df.sum(axis=1))


# In[36]:


rowsum = team_df.sum(axis=1)
print(type(rowsum))


# In[37]:
import matplotlib.pyplot as plt

# Ported from Python 2 print statements to the print() function.
# (`np`, `DataFrame`, `Series` and `randn` come from imports outside this
# fragment.)
array1 = np.array([[10, np.nan, 20], [30, 40, np.nan]])
print(array1)
df1 = DataFrame(array1, index=[1, 2], columns=list('ABC'))
print(df1)

# sum()
print("Sum of cols", df1.sum())  # sums along each column
print(df1.sum(axis=1))  # sum along indexes
print("Min", df1.min())
print("Max", df1.max())
print(df1.idxmax())
print(df1.cumsum())
print(df1.describe())

df2 = DataFrame(randn(9).reshape(3, 3), index=[1, 2, 3], columns=list('ABC'))
print(df2)
plt.plot(df2)
plt.legend(df2.columns, loc="lower right")
plt.savefig('samplepic.png')
plt.show()

series1 = Series(list('abcccaabd'))
print(series1.unique())
print(series1.value_counts())
# Merge on the index of both frames: 'outer' keeps the union of the index
# labels, 'inner' keeps only the labels present in both.
# (`left` and `right` are defined earlier, outside this fragment.)
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
res = pd.merge(left, right, left_index=True, right_index=True, how='inner')

# handle overlapping
# Both frames have an 'age' column; `suffixes` is appended to the clashing
# column names so the result has 'age_boy' and 'age_girl'.
boys = DataFrame({'k': ['k0', 'k1', 'k2'], 'age': [1, 2, 3]})
girls = DataFrame({'k': ['k0', 'k0', 'k3'], 'age': [4, 5, 6]})
res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='outer')

# pandas plot
# Series
# cumulative sum of random normal steps, i.e. a random walk
data = Series(np.random.randn(1000), index=np.arange(1000))
data = data.cumsum()
data.plot()
plt.show()

# DataFrame
# four random walks, one per column
data = DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list('abcd'))
data = data.cumsum()
print(data.head())
data.plot()
plt.show()

# plot methods: bar, hist, box, kde, area, scatter, hexbin, pie
ax = data.plot.scatter(x='a', y='b', color='DarkBlue', label='Class 1')
def plotter(title, df, kind='line', x_label=None, y_label=None,
            style='ggplot', figsize=(8, 4), save=False, legend_pos='best',
            reverse_legend='guess', num_to_plot=7, tex='try',
            colours='Accent', cumulative=False, pie_legend=True,
            partial_pie=False, show_totals=False, transparent=False,
            output_format='png', interactive=False, black_and_white=False,
            show_p_val=False, indices=False, **kwargs):
    """Visualise corpus interrogations.

    The data is copied, trimmed to *num_to_plot* entries, optionally
    relabelled with totals / p-values, and handed to ``DataFrame.plot``;
    legend, axis labels, subplot titles and saving are then handled here.
    Remaining ``**kwargs`` are passed through to ``DataFrame.plot``.

    :param title: A title for the plot
    :type title: str
    :param df: Data to be plotted
    :type df: pandas.core.frame.DataFrame (a Series is also accepted)
    :param x_label: A label for the x axis (False for none)
    :type x_label: str
    :param y_label: A label for the y axis (False for none)
    :type y_label: str
    :param kind: The kind of chart to make (case-insensitive)
    :type kind: str ('line'/'bar'/'barh'/'pie'/'area')
    :param style: Visual theme of plot
    :type style: str ('ggplot'/'bmh'/'fivethirtyeight'/'seaborn-talk'/etc)
    :param figsize: Size of plot
    :type figsize: tuple (int, int)
    :param save: If bool, save with *title* as name; if str, use str as name
    :type save: bool/str
    :param legend_pos: Where to place legend
    :type legend_pos: str ('upper right'/'outside right'/etc) or int
    :param reverse_legend: Reverse the order of the legend; 'guess'
                           reverses for 'barh' and 'area' plots
    :type reverse_legend: bool/'guess'
    :param num_to_plot: How many columns to plot
    :type num_to_plot: int/'all'
    :param tex: Use TeX to draw plot text
    :type tex: bool/'try'
    :param colours: Colourmap for lines/bars/slices
    :type colours: str
    :param cumulative: Plot values cumulatively
    :type cumulative: bool
    :param pie_legend: Show a legend for pie chart
    :type pie_legend: bool
    :param partial_pie: Allow plotting of pie slices only
    :type partial_pie: bool
    :param show_totals: Print sums in plot where possible
    :type show_totals: str -- 'legend'/'plot'/'both' (False for none)
    :param transparent: Transparent .png background
    :type transparent: bool
    :param output_format: File format for saved image
    :type output_format: str -- 'png'/'pdf'
    :param black_and_white: Create black and white line styles
    :type black_and_white: bool
    :param show_p_val: Attempt to print p values in legend if contained in df
    :type show_p_val: bool
    :param indices: To use when plotting "distance from root"
    :type indices: bool/'guess'
    :param stacked: When making bar chart, stack bars on top of one another
    :type stacked: str
    :param filled: For area and bar charts, make every column sum to 100
    :type filled: str
    :param legend: Show a legend
    :type legend: bool
    :param rot: Rotate x axis ticks by *rot* degrees
    :type rot: int
    :param subplots: Plot each column separately
    :type subplots: bool
    :param layout: Grid shape to use when *subplots* is True
    :type layout: tuple -- (int, int)
    :param interactive: Experimental interactive options
    :type interactive: list -- [1, 2, 3]

    The keyword arguments ``draggable``, ``savepath``, ``explode``,
    ``grid``, ``ncol`` and ``shadow`` are also interpreted here.

    :returns: ``mpld3.display()`` when *interactive*; ``matplotlib.pyplot``
              when ``check_spider()`` or ``check_t_kinter()`` is true;
              otherwise None (the figure is shown directly unless
              ``check_pytex()`` is true)
    :raises ValueError: if *output_format* is not recognised or the saved
                        file cannot be found afterwards
    :raises KeyError: if *legend_pos* is an unknown string
    """
    # imported but not otherwise referenced in this function
    import corpkit
    import os
    # imported unconditionally: warnings.warn() is used below even when the
    # optional IPython shim is missing
    import warnings
    try:
        from IPython.utils.shimmodule import ShimWarning
        warnings.simplefilter('ignore', ShimWarning)
    except Exception:
        pass
    import matplotlib as mpl
    from matplotlib import rc
    # prefer seaborn plotting
    try:
        import seaborn as sns
    except ImportError:
        sns = None
    import matplotlib.pyplot as plt
    import pandas
    from pandas import DataFrame
    import numpy
    import numpy as np
    from time import localtime, strftime
    from tests import check_pytex, check_spider, check_t_kinter

    if interactive:
        import mpld3
        import collections
        from mpld3 import plugins, utils
        from plugins import InteractiveLegendPlugin, HighlightLines

    # check what environment we're in
    tk = check_t_kinter()
    running_python_tex = check_pytex()
    running_spider = check_spider()

    def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
        """remove extreme values from colourmap --- no pure white"""
        import matplotlib.colors as colors
        new_cmap = colors.LinearSegmentedColormap.from_list(
            'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval),
            cmap(np.linspace(minval, maxval, n)))
        return new_cmap

    def get_savename(imagefolder, save=False, title=False, ext='png'):
        """Come up with the savename for the image."""
        import re

        def urlify(s):
            "Turn title into filename"
            s = s.lower()
            s = re.sub(r"[^\w\s-]", '', s)
            s = re.sub(r"\s+", '-', s)
            s = re.sub(r"-(textbf|emph|textsc|textit)", '-', s)
            return s

        if not ext.startswith('.'):
            ext = '.' + ext
        if type(save) == str:
            savename = os.path.join(imagefolder, (urlify(save) + ext))
        # this 'else' is redundant now that title is obligatory
        else:
            if title:
                filename = urlify(title) + ext
                savename = os.path.join(imagefolder, filename)
        # remove duplicated ext
        doubled = '%s%s' % (ext, ext)
        if savename.endswith(doubled):
            savename = savename.replace(doubled, ext, 1)
        return savename

    def rename_data_with_total(dataframe, was_series=False, using_tex=False,
                               absolutes=True):
        """adds totals (abs, rel, keyness) to entry name strings"""
        if was_series:
            where_the_words_are = dataframe.index
        else:
            where_the_words_are = dataframe.columns
        the_labs = []
        for w in list(where_the_words_are):
            if not absolutes:
                if not was_series:
                    # relative data with several rows: label left untouched
                    the_labs.append(w)
                    continue
                perc = dataframe.T[w].iloc[0]
                if using_tex:
                    # raw string: the backslash escapes the percent for TeX
                    the_labs.append(r'%s (%.2f\%%)' % (w, perc))
                else:
                    the_labs.append('%s (%.2f %%)' % (w, perc))
            else:
                if was_series:
                    score = dataframe.T[w].sum()
                else:
                    score = dataframe[w].sum()
                the_labs.append('%s (n=%d)' % (w, score))
        if not was_series:
            dataframe.columns = the_labs
        else:
            vals = list(dataframe[list(dataframe.columns)[0]].values)
            dataframe = pandas.DataFrame(vals, index=the_labs)
            dataframe.columns = ['Total']
        return dataframe

    def auto_explode(dataframe, wanted, was_series=False, num_to_plot=7):
        """give me a list of strings and i'll output explode option"""
        output = [0 for _ in range(num_to_plot)]
        if was_series:
            names = list(dataframe.index)
        else:
            names = list(dataframe.columns)
        if type(wanted) == str or type(wanted) == int:
            wanted = [wanted]
        if type(wanted) == list:
            for entry in wanted:
                # strings are looked up by name, anything else is a position
                if type(entry) == str:
                    position = names.index(entry)
                else:
                    position = entry
                output[position] = 0.1
        return output

    def filler(frame):
        """rescale every row so that it sums to 100"""
        pby = frame.T.copy()
        for i in list(pby.columns):
            tot = pby[i].sum()
            pby[i] = pby[i] * 100.0 / tot
        return pby.T

    def p_string_formatter(val):
        """format a p value for display in an entry name"""
        if val < 0.001:
            if not using_tex:
                return 'p < 0.001'
            else:
                return r'p $<$ 0.001'
        else:
            return 'p = %s' % format(val, '.3f')

    def is_number(s):
        """check if str can be can be made into float/int"""
        try:
            float(s)  # for int, long and float
        except ValueError:
            try:
                complex(s)  # for complex
            except ValueError:
                return False
        return True

    # check if we're doing subplots
    sbplt = kwargs.get('subplots') is True
    kwargs['subplots'] = sbplt

    if colours is True:
        colours = 'Paired'

    # todo: get this dynamically instead.
    styles = ['dark_background', 'bmh', 'grayscale', 'ggplot',
              'fivethirtyeight', 'matplotlib', False, 'mpl-white']
    #if style not in styles:
        #raise ValueError('Style %s not found. Use %s' % (str(style), ', '.join(styles)))

    if style == 'mpl-white':
        if sns is not None:
            try:
                sns.set_style("whitegrid")
            except Exception:
                pass
        style = 'matplotlib'

    if style is not False and style.startswith('seaborn'):
        colours = False

    # use 'draggable = True' to make a draggable legend
    dragmode = kwargs.pop('draggable', False)

    savepath = kwargs.pop('savepath', None)
    if savepath:
        mpl.rcParams['savefig.directory'] = savepath

    mpl.rcParams['savefig.bbox'] = 'tight'
    mpl.rcParams.update({'figure.autolayout': True})

    # try to use tex
    # TO DO:
    # make some font kwargs here
    using_tex = False
    mpl.rcParams['font.family'] = 'sans-serif'
    # only set where the installed matplotlib knows this parameter
    if 'text.latex.unicode' in mpl.rcParams:
        mpl.rcParams['text.latex.unicode'] = True

    if tex == 'try' or tex is True:
        try:
            rc('text', usetex=True)
            rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
            using_tex = True
        except Exception:
            rc('font', family='sans-serif')
            rc('font', serif='Helvetica Neue')
            rc('text', usetex=False)
    else:
        rc('text', usetex=False)

    if interactive:
        using_tex = False

    if show_totals is False:
        show_totals = 'none'
    totals_on_plot = show_totals.endswith(('plot', 'both'))
    totals_in_legend = show_totals.endswith(('both', 'legend'))

    # find out what kind of plot we're making, and enable
    # or disable interactive values if need be.
    # normalised once so every later comparison agrees with what pandas gets
    kind = kind.lower()
    kwargs['kind'] = kind

    interactive_types = [None]
    if interactive:
        if kind.startswith('bar'):
            interactive_types = [3]
        elif kind in ('area', 'line'):
            interactive_types = [2, 3]
        elif kind == 'pie':
            warnings.warn('Interactive plotting not yet available for pie plots.')

    # find out if pie mode, add autopct format
    piemode = kind == 'pie'
    if piemode and totals_on_plot:
        kwargs['pctdistance'] = 0.6
        if using_tex:
            kwargs['autopct'] = r'%1.1f\%%'
        else:
            kwargs['autopct'] = '%1.1f%%'

    # copy data, make series into df
    dataframe = df.copy()
    was_series = False
    if isinstance(dataframe, pandas.Series):
        was_series = True
        if cumulative:
            dataframe = dataframe.cumsum()
        dataframe = DataFrame(dataframe)
    else:
        # don't know if this is much good.
        if cumulative:
            dataframe = DataFrame(dataframe.cumsum())
        if len(list(dataframe.columns)) == 1:
            was_series = True

    # attempt to convert x axis to ints:
    try:
        dataframe.index = [int(i) for i in list(dataframe.index)]
    except Exception:
        pass

    # remove totals and tkinter order
    def _is_total(name):
        # column labels are not necessarily strings
        return hasattr(name, 'lower') and name.lower() == 'total'

    if not was_series and not all(_is_total(x) for x in list(dataframe.columns)):
        for name in ('Total', 'tkintertable-order'):
            for axis_num in (0, 1):
                try:
                    dataframe = dataframe.drop(name, axis=axis_num, errors='ignore')
                except Exception:
                    pass
    else:
        dataframe = dataframe.drop('tkintertable-order', errors='ignore')
        dataframe = dataframe.drop('tkintertable-order', axis=1, errors='ignore')

    # look at columns to see if all can be ints, in which case, set up figure
    # for depnumming
    if not was_series:
        if indices == 'guess':
            def isint(x):
                try:
                    a = float(x)
                    b = int(a)
                except (ValueError, OverflowError, TypeError):
                    return False
                else:
                    return a == b

            indices = all(isint(x) for x in list(dataframe.columns))

        # if depnumming, plot all, transpose, and rename axes
        if indices is True:
            num_to_plot = 'all'
            dataframe = dataframe.T
            if y_label is None:
                y_label = 'Percentage of all matches'
            if x_label is None:
                x_label = ''

    # set backend?
    output_formats = ['svgz', 'ps', 'emf', 'rgba', 'raw', 'pdf', 'svg',
                      'eps', 'png', 'pgf']
    if output_format not in output_formats:
        raise ValueError('%s output format not recognised. Must be: %s' % (output_format, ', '.join(output_formats)))

    # don't know if these are necessary
    if 'pdf' in output_format:
        plt.switch_backend(output_format)
    if 'pgf' in output_format:
        plt.switch_backend(output_format)

    if num_to_plot == 'all':
        if was_series:
            num_to_plot = len(dataframe)
        elif not piemode:
            num_to_plot = len(list(dataframe.columns))
        else:
            num_to_plot = len(dataframe.index)

    # explode pie, or remove if not piemode
    if piemode and not sbplt and kwargs.get('explode'):
        kwargs['explode'] = auto_explode(dataframe,
                                         kwargs['explode'],
                                         was_series=was_series,
                                         num_to_plot=num_to_plot)
    else:
        kwargs.pop('explode', None)

    legend = kwargs.get('legend', False)

    # cut data short
    plotting_a_totals_column = False
    if was_series:
        if list(dataframe.columns)[0] != 'Total':
            try:
                # an all-integer index is plotted in full
                can_be_ints = [int(x) for x in list(dataframe.index)]
                num_to_plot = len(dataframe)
            except Exception:
                dataframe = dataframe[:num_to_plot]
        else:
            plotting_a_totals_column = True
            if 'legend' not in kwargs:
                legend = False
            num_to_plot = len(dataframe)
    else:
        dataframe = dataframe.T.head(num_to_plot).T

    # remove stats fields, put p in entry text, etc.
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        dataframe = dataframe.drop(statfields, axis=1, errors='ignore')
    except Exception:
        pass
    try:
        dataframe.loc['p']
        there_are_p_vals = True
    except Exception:
        there_are_p_vals = False

    if show_p_val:
        if there_are_p_vals:
            newnames = []
            for col in list(dataframe.columns):
                pstr = p_string_formatter(dataframe[col]['p'])
                newnames.append('%s (%s)' % (col, pstr))
            dataframe.columns = newnames
            dataframe.drop(statfields, axis=0, inplace=True, errors='ignore')
        else:
            warnings.warn('No p-values calculated to show.\n\nUse sort_by and keep_stats in editor() to generate these values.')
    else:
        if there_are_p_vals:
            dataframe.drop(statfields, axis=0, inplace=True, errors='ignore')

    # make and set y label: data counts as absolute frequencies when
    # every value in the first row is a whole number
    absolutes = True
    if type(dataframe) == pandas.core.frame.DataFrame:
        try:
            if not all([s.is_integer() for s in dataframe.iloc[0, :].values]):
                absolutes = False
        except Exception:
            pass
    else:
        if not all([s.is_integer() for s in dataframe.values]):
            absolutes = False

    # use colormap if need be; a falsy value is never handed to pandas
    if colours == 'Default':
        colours = 'Paired'
    if num_to_plot > 0 and colours:
        kwargs['colormap'] = colours

    # multicoloured bar charts
    if colours:
        if kind.startswith('bar'):
            if len(list(dataframe.columns)) == 1:
                if not black_and_white:
                    the_range = np.linspace(0, 1, num_to_plot)
                    cmap = plt.get_cmap(colours)
                    kwargs['colors'] = [cmap(n) for n in the_range]
                # make a bar width ... ? ...
                #kwargs['width'] = (figsize[0] / float(num_to_plot)) / 1.5

    # reversing legend option: explicit True wins, 'guess' reverses for
    # the kinds listed below, anything else leaves the order alone
    rev_leg = False
    if reverse_legend is True:
        rev_leg = True
    elif reverse_legend == 'guess' and kind in ['barh', 'area']:
        rev_leg = True

    # show legend or don't
    if kind in ['bar', 'barh', 'area', 'line', 'pie']:
        if was_series:
            legend = False
        if kind == 'pie':
            legend = bool(pie_legend)

    # the default legend placement
    if legend_pos is True:
        legend_pos = 'best'

    # cut dataframe if just_totals
    try:
        tst = dataframe['Combined total']
        dataframe = dataframe.head(num_to_plot)
    except Exception:
        pass

    # rotate automatically
    if 'rot' not in kwargs:
        if not was_series:
            xvals = [str(i) for i in list(dataframe.index)[:num_to_plot]]
        else:
            xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        if xvals and len(max(xvals, key=len)) > 6:
            if not piemode:
                kwargs['rot'] = 45

    # no title for subplots because ugly,
    if title and not sbplt:
        kwargs['title'] = title

    # no interactive subplots yet:
    if sbplt and interactive:
        interactive = False
        warnings.warn('No interactive subplots yet, sorry.')
        return

    # not using pandas for labels or legend anymore.
    #kwargs['labels'] = None
    #kwargs['legend'] = False

    # always bound, because the pie code below reads it even without a legend
    leg_options = {}
    lgd = None

    if legend:
        if num_to_plot > 6:
            if not kwargs.get('ncol'):
                # floor division keeps this an int on Python 3 as well
                kwargs['ncol'] = num_to_plot // 7
        # kwarg options go in leg_options
        leg_options = {'framealpha': .8,
                       'shadow': kwargs.get('shadow', False),
                       'ncol': kwargs.pop('ncol', 1)}

        # determine legend position based on this dict
        if legend_pos:
            possible = {'best': 0,
                        'upper right': 1,
                        'upper left': 2,
                        'lower left': 3,
                        'lower right': 4,
                        'right': 5,
                        'center left': 6,
                        'center right': 7,
                        'lower center': 8,
                        'upper center': 9,
                        'center': 10,
                        'o r': 2,
                        'outside right': 2,
                        'outside upper right': 2,
                        'outside center right': 'center left',
                        'outside lower right': 'lower left'}

            if type(legend_pos) == int:
                the_loc = legend_pos
            elif type(legend_pos) == str:
                try:
                    the_loc = possible[legend_pos]
                except KeyError:
                    raise KeyError('legend_pos value must be one of:\n%s\n or an int between 0-10.' %', '.join(list(possible.keys())))
            leg_options['loc'] = the_loc
            # weirdness needed for outside plot
            if legend_pos in ['o r', 'outside right', 'outside upper right']:
                leg_options['bbox_to_anchor'] = (1.02, 1)
            if legend_pos == 'outside center right':
                leg_options['bbox_to_anchor'] = (1.02, 0.5)
            if legend_pos == 'outside lower right':
                # NOTE(review): the original had the no-op comparison
                # `leg_options['loc'] == 'upper right'` here; 'loc' therefore
                # stays 'lower left'. Confirm which anchor corner is intended.
                leg_options['bbox_to_anchor'] = (0.5, 0.5)

            # a bit of distance between legend and plot for outside legends
            if type(legend_pos) == str:
                if legend_pos.startswith('o'):
                    leg_options['borderaxespad'] = 1

    # put totals into the entry names (pies only when they have a legend)
    if totals_in_legend and (not piemode or pie_legend):
        dataframe = rename_data_with_total(dataframe,
                                           was_series=was_series,
                                           using_tex=using_tex,
                                           absolutes=absolutes)

    if piemode:
        if partial_pie:
            dataframe = dataframe / 100.0

    # some pie things
    if piemode:
        if not sbplt:
            kwargs['y'] = list(dataframe.columns)[0]
            if pie_legend:
                kwargs['legend'] = False
                if was_series:
                    leg_options['labels'] = list(dataframe.index)
                else:
                    leg_options['labels'] = list(dataframe.columns)
        else:
            if pie_legend:
                kwargs['legend'] = False
                leg_options['labels'] = list(dataframe.index)

    areamode = kind == 'area'

    if legend is False:
        kwargs['legend'] = False

    # line highlighting option for interactive!
    if interactive:
        if 2 in interactive_types:
            if kind == 'line':
                kwargs['marker'] = ','
        if not piemode:
            kwargs['alpha'] = 0.1

    # convert dates --- works only in my current case!
    if plotting_a_totals_column or not was_series:
        try:
            first_index = int(list(dataframe.index)[0])
        except Exception:
            first_index = None
        # a first index entry between 1500 and 2050 is treated as a year
        if first_index is not None and 1500 < first_index < 2050:
            n = pandas.PeriodIndex([d for d in list(dataframe.index)], freq='A')
            dataframe = dataframe.set_index(n)

    if kwargs.get('filled'):
        if areamode or kind.startswith('bar'):
            dataframe = filler(dataframe)
    # never forwarded to pandas
    kwargs.pop('filled', None)

    MARKERSIZE = 4
    # marker/dash combinations cycled through for black and white lines
    COLORMAP = {
        0: {'marker': None, 'dash': (None, None)},
        1: {'marker': None, 'dash': [5, 5]},
        2: {'marker': "o", 'dash': (None, None)},
        3: {'marker': None, 'dash': [1, 3]},
        4: {'marker': "s", 'dash': [5, 2, 5, 2, 5, 10]},
        5: {'marker': None, 'dash': [5, 3, 1, 2, 1, 10]},
        6: {'marker': 'o', 'dash': (None, None)},
        7: {'marker': None, 'dash': [5, 3, 1, 3]},
        8: {'marker': "1", 'dash': [1, 3]},
        9: {'marker': "*", 'dash': [5, 5]},
        10: {'marker': "2", 'dash': [5, 2, 5, 2, 5, 10]},
        11: {'marker': "s", 'dash': (None, None)}
        }

    if black_and_white:
        if kind == 'line':
            kwargs['linewidth'] = 1

        cmap = plt.get_cmap('Greys')
        new_cmap = truncate_colormap(cmap, 0.25, 0.95)
        if kind == 'bar':
            # darker if just one entry
            if len(dataframe.columns) == 1:
                new_cmap = truncate_colormap(cmap, 0.70, 0.90)
        kwargs['colormap'] = new_cmap

    class dummy_context_mgr():
        """a fake context for plotting without style
        perhaps made obsolete by 'classic' style in new mpl"""
        def __enter__(self):
            return None

        def __exit__(self, one, two, three):
            return False

    if style != 'matplotlib':
        style_context = plt.style.context((style))
    else:
        style_context = dummy_context_mgr()

    with style_context:

        if not sbplt:
            # check if negative values, no stacked if so
            if areamode:
                kwargs['legend'] = False
                if (dataframe < 0.0).any().any():
                    kwargs['stacked'] = False
                    rev_leg = False
            ax = dataframe.plot(figsize=figsize, **kwargs)
        else:
            plt.gcf().set_tight_layout(False)
            ax = dataframe.plot(figsize=figsize, **kwargs)
            if piemode:
                handles, labels = plt.gca().get_legend_handles_labels()
                plt.legend(handles, labels, loc=leg_options.get('loc', 0),
                           bbox_to_anchor=(0, -0.1, 1, 1),
                           bbox_transform=plt.gcf().transFigure)

            # this line allows layouts with missing plots
            # i.e. layout = (5, 2) with only nine plots
            plt.gcf().set_tight_layout(False)

        # with subplots `ax` is a collection of axes, which has no tick labels
        if 'rot' in kwargs and hasattr(ax, 'get_xticklabels'):
            if kwargs['rot'] != 0 and kwargs['rot'] != 90:
                labels = [item.get_text() for item in ax.get_xticklabels()]
                ax.set_xticklabels(labels, rotation=kwargs['rot'], ha='right')

        if transparent:
            plt.gcf().patch.set_facecolor('white')
            plt.gcf().patch.set_alpha(0)

        if black_and_white:
            if kind == 'line':
                # white background
                # change everything to black and white with interesting
                # dashes and markers, cycling through COLORMAP
                for number, line in enumerate(ax.get_lines()):
                    look = COLORMAP[number % len(COLORMAP)]
                    line.set_color('black')
                    #line.set_width(1)
                    line.set_dashes(look['dash'])
                    line.set_marker(look['marker'])
                    line.set_markersize(MARKERSIZE)

        # draw legend with proper placement etc
        if legend:
            if not piemode and not sbplt:
                if 3 not in interactive_types:
                    handles, labels = plt.gca().get_legend_handles_labels()
                    # area doubles the handles and labels. this removes half:
                    if areamode:
                        handles = handles[-len(handles) // 2:]
                        labels = labels[-len(labels) // 2:]
                    if rev_leg:
                        handles = handles[::-1]
                        labels = labels[::-1]
                    lgd = plt.legend(handles, labels, **leg_options)

    if interactive:
        # 1 = highlight lines
        # 2 = line labels
        # 3 = legend switches
        ax = plt.gca()
        # fails for piemode
        lines = ax.lines
        handles, labels = plt.gca().get_legend_handles_labels()
        if 1 in interactive_types:
            plugins.connect(plt.gcf(), HighlightLines(lines))

        if 3 in interactive_types:
            plugins.connect(plt.gcf(), InteractiveLegendPlugin(lines, labels, alpha_unsel=0.0))

        for i, l in enumerate(lines):
            y_vals = l.get_ydata()
            x_vals = l.get_xdata()
            x_vals = [str(x) for x in x_vals]
            if absolutes:
                ls = ['%s (%s: %d)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            else:
                ls = ['%s (%s: %.2f%%)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            if 2 in interactive_types:
                #if 'kind' in kwargs and kwargs['kind'] == 'area':
                tooltip_line = mpld3.plugins.LineLabelTooltip(lines[i], labels[i])
                mpld3.plugins.connect(plt.gcf(), tooltip_line)
                #else:
                if kind == 'line':
                    tooltip_point = mpld3.plugins.PointLabelTooltip(l, labels=ls)
                    mpld3.plugins.connect(plt.gcf(), tooltip_point)

    if piemode:
        if not sbplt:
            plt.axis('equal')
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)

    # add x label
    # this could be revised now!
    # if time series period, it's year for now
    has_period_index = isinstance(dataframe.index, pandas.PeriodIndex)
    if has_period_index:
        x_label = 'Year'

    if x_label is not False:
        if type(x_label) == str:
            plt.xlabel(x_label)
        else:
            # guess from the first index entry
            try:
                check_x_axis = int(list(dataframe.index)[0])
                if 1500 < check_x_axis < 2050:
                    x_label = 'Year'
                else:
                    x_label = 'Group'
            except Exception:
                x_label = 'Group'

        if not sbplt:
            if not piemode:
                plt.xlabel(x_label)

    # for now, always turn off sci notation
    from matplotlib.ticker import ScalarFormatter
    if not has_period_index:
        try:
            if all(is_number(s) for s in list(dataframe.index)):
                plt.gca().xaxis.set_major_formatter(ScalarFormatter())
        except Exception:
            pass
    try:
        if all(is_number(s) for s in list(dataframe.columns)):
            plt.gca().yaxis.set_major_formatter(ScalarFormatter())
    except Exception:
        pass

    # y labelling
    if not absolutes:
        y_l = 'Percentage'
    else:
        y_l = 'Absolute frequency'

    if y_label is not False:
        if type(y_label) == str:
            the_y = y_label
        else:
            the_y = y_l
        if not sbplt:
            if not piemode:
                plt.ylabel(the_y)
        else:
            # one shared label for the whole figure
            plt.gcf().text(0.04, 0.5, the_y, va='center', rotation='vertical')
            #plt.subplots_adjust(left=0.5)

    # read once so that every axes gets the same setting
    show_grid = kwargs.pop('grid', False)

    # hacky: turn legend into subplot titles :)
    if sbplt:
        # title the big plot
        #plt.gca().suptitle(title, fontsize = 16)
        #plt.subplots_adjust(top=0.9)
        # get all axes
        if 'layout' not in kwargs:
            axes = list(ax)
        else:
            axes = [bit for col in ax for bit in col]

        # set subplot titles
        column_names = list(dataframe.columns)
        titletext = ''
        for index, a in enumerate(axes):
            # axes beyond the last column keep the previous title
            if index < len(column_names):
                titletext = column_names[index]
            a.set_title(titletext)
            try:
                a.legend_.remove()
            except Exception:
                pass
            # remove axis labels for pie plots
            if piemode:
                a.axes.get_xaxis().set_visible(False)
                a.axes.get_yaxis().set_visible(False)
                a.axis('equal')

            # show grid
            a.grid(show_grid)
    else:
        # show grid
        ax.grid(show_grid)

        # add sums to bar graphs
        if kind.startswith('bar') and totals_on_plot:
            if was_series:
                # make plot a bit higher if putting these totals on it
                plt.ylim([0, plt.ylim()[1] * 1.05])
                entries = [dataframe.loc[label] for label in list(dataframe.index)]
            else:
                entries = [dataframe[label] for label in list(dataframe.columns)]

            for i, values in enumerate(entries):
                if len(values) == 1:
                    score = values.iloc[0]
                elif absolutes:
                    score = values.sum()
                else:
                    # It's not possible to determine total percentage
                    # from individual percentages.
                    continue
                if not absolutes:
                    plt.annotate('%.2f' % score, (i, score), ha='center', va='bottom')
                else:
                    plt.annotate(score, (i, score), ha='center', va='bottom')

    plt.subplots_adjust(left=0.1)
    plt.subplots_adjust(bottom=0.18)

    if 'layout' not in kwargs:
        if not sbplt:
            plt.tight_layout()

    if save:
        if running_python_tex:
            imagefolder = '../images'
        else:
            imagefolder = 'images'

        savename = get_savename(imagefolder, save=save, title=title, ext=output_format)

        if not os.path.isdir(imagefolder):
            os.makedirs(imagefolder)

        # save image and get on with our lives; an outside legend has to be
        # named explicitly, or it is cropped from the saved image
        outside_legend = isinstance(legend_pos, str) and legend_pos.startswith('o')
        if outside_legend and lgd is not None:
            plt.gcf().savefig(savename, dpi=150, bbox_extra_artists=(lgd,),
                              bbox_inches='tight', format=output_format)
        else:
            plt.gcf().savefig(savename, dpi=150, format=output_format)
        time = strftime("%H:%M:%S", localtime())
        if os.path.isfile(savename):
            print('\n' + time + ": " + savename + " created.")
        else:
            raise ValueError("Error making %s." % savename)

    if dragmode:
        drag_legend = plt.legend()
        # newer matplotlib renamed Legend.draggable to set_draggable
        if hasattr(drag_legend, 'set_draggable'):
            drag_legend.set_draggable(True)
        else:
            drag_legend.draggable()

    if sbplt:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)

    if not interactive and not running_python_tex and not running_spider \
            and not tk:
        plt.gcf().show()
        return
    elif running_spider or tk:
        return plt

    if interactive:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)
        try:
            ax.legend_.remove()
        except Exception:
            pass
        return mpld3.display()
def plotter(title, df, x_label = None, y_label = None, style = 'ggplot', figsize = (8, 4), save = False, legend_pos = 'best', reverse_legend = 'guess', num_to_plot = 7, tex = 'try', colours = 'Paired', cumulative = False, pie_legend = True, partial_pie = False, show_totals = False, transparent = False, output_format = 'png', interactive = False, black_and_white = False, show_p_val = False, indices = 'guess', **kwargs): """plot interrogator() or editor() output. **kwargs are for pandas first, which can then send them through to matplotlib.plot(): http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.plot.html http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.plot pie_legend: False to label slices rather than give legend show_totals: where to show percent/abs frequencies: False, 'plot', 'legend', or 'both' """ import corpkit import os import matplotlib as mpl if interactive: import matplotlib.pyplot as plt, mpld3 else: import matplotlib.pyplot as plt from matplotlib import rc import pandas import pandas as pd from pandas import DataFrame import numpy from time import localtime, strftime from corpkit.tests import check_pytex, check_spider, check_t_kinter if interactive: import mpld3 import collections from mpld3 import plugins, utils from plugins import InteractiveLegendPlugin, HighlightLines tk = check_t_kinter() running_python_tex = check_pytex() # incorrect spelling of spider on purpose running_spider = check_spider() def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100): """remove extreme values from colourmap --- no pure white""" import matplotlib.colors as colors import numpy as np new_cmap = colors.LinearSegmentedColormap.from_list( 'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval), cmap(np.linspace(minval, maxval, n))) return new_cmap def get_savename(imagefolder, save = False, title = False, ext = 'png'): """Come up with the savename for the image.""" import os def urlify(s): "Turn title into filename" import re 
s = s.lower() s = re.sub(r"[^\w\s-]", '', s) s = re.sub(r"\s+", '-', s) s = re.sub(r"-(textbf|emph|textsc|textit)", '-', s) return s # name as if not ext.startswith('.'): ext = '.' + ext if type(save) == str: savename = os.path.join(imagefolder, (urlify(save) + ext)) #this 'else' is redundant now that title is obligatory else: if title: filename = urlify(title) + ext savename = os.path.join(imagefolder, filename) # remove duplicated ext if savename.endswith('%s%s' % (ext, ext)): savename = savename.replace('%s%s' % (ext, ext), ext, 1) return savename def rename_data_with_total(dataframe, was_series = False, using_tex = False, absolutes = True): """adds totals (abs, rel, keyness) to entry name strings""" if was_series: where_the_words_are = dataframe.index else: where_the_words_are = dataframe.columns the_labs = [] for w in list(where_the_words_are): if not absolutes: if was_series: perc = dataframe.T[w][0] else: the_labs.append(w) continue if using_tex: the_labs.append('%s (%.2f\%%)' % (w, perc)) else: the_labs.append('%s (%.2f %%)' % (w, perc)) else: if was_series: score = dataframe.T[w].sum() else: score = dataframe[w].sum() if using_tex: the_labs.append('%s (n=%d)' % (w, score)) else: the_labs.append('%s (n=%d)' % (w, score)) if not was_series: dataframe.columns = the_labs else: vals = list(dataframe[list(dataframe.columns)[0]].values) dataframe = pd.DataFrame(vals, index = the_labs) dataframe.columns = ['Total'] return dataframe def auto_explode(dataframe, input, was_series = False, num_to_plot = 7): """give me a list of strings and i'll output explode option""" output = [0 for s in range(num_to_plot)] if was_series: l = list(dataframe.index) else: l = list(dataframe.columns) if type(input) == str or type(input) == int: input = [input] if type(input) == list: for i in input: if type(i) == str: index = l.index(i) else: index = i output[index] = 0.1 return output # are we doing subplots? 
sbplt = False if 'subplots' in kwargs: if kwargs['subplots'] is True: sbplt = True if colours is True: colours = 'Paired' styles = ['dark_background', 'bmh', 'grayscale', 'ggplot', 'fivethirtyeight'] if style not in styles: raise ValueError('Style %s not found. Use %s' % (style, ', '.join(styles))) if 'savepath' in kwargs.keys(): mpl.rcParams['savefig.directory'] = kwargs['savepath'] del kwargs['savepath'] mpl.rcParams['savefig.bbox'] = 'tight' # try to use tex # TO DO: # make some font kwargs here using_tex = False mpl.rcParams['font.family'] = 'sans-serif' mpl.rcParams['text.latex.unicode'] = True if tex == 'try' or tex is True: try: rc('text', usetex=True) rc('font', **{'family': 'serif', 'serif': ['Computer Modern']}) using_tex = True except: matplotlib.rc('font', family='sans-serif') matplotlib.rc('font', serif='Helvetica Neue') matplotlib.rc('text', usetex='false') rc('text', usetex=False) else: rc('text', usetex=False) if interactive: using_tex = False if show_totals is False: show_totals = 'none' # find out what kind of plot we're making, and enable # or disable interactive values if need be if 'kind' not in kwargs: kwargs['kind'] = 'line' if interactive: if kwargs['kind'].startswith('bar'): interactive_types = [3] elif kwargs['kind'] == 'area': interactive_types = [2, 3] elif kwargs['kind'] == 'line': interactive_types = [2, 3] elif kwargs['kind'] == 'pie': interactive_types = None warnings.warn('Interactive plotting not yet available for pie plots.') else: interactive_types = [None] if interactive is False: interactive_types = [None] # find out if pie mode, add autopct format piemode = False if 'kind' in kwargs: if kwargs['kind'] == 'pie': piemode = True # always the best spot for pie #if legend_pos == 'best': #legend_pos = 'lower left' if show_totals.endswith('plot') or show_totals.endswith('both'): kwargs['pctdistance'] = 0.6 if using_tex: kwargs['autopct'] = r'%1.1f\%%' else: kwargs['autopct'] = '%1.1f%%' #if piemode: #if partial_pie: 
#kwargs['startangle'] = 180 kwargs['subplots'] = sbplt # copy data, make series into df dataframe = df.copy() was_series = False if type(dataframe) == pandas.core.series.Series: was_series = True if not cumulative: dataframe = DataFrame(dataframe) else: dataframe = DataFrame(dataframe.cumsum()) else: # don't know if this is much good. if cumulative: dataframe = DataFrame(dataframe.cumsum()) if len(list(dataframe.columns)) == 1: was_series = True # attempt to convert x axis to ints: try: dataframe.index = [int(i) for i in list(dataframe.index)] except: pass # remove totals and tkinter order if not was_series: for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]): dataframe = dataframe.drop(name, axis = ax, errors = 'ignore') else: dataframe = dataframe.drop('tkintertable-order', errors = 'ignore') dataframe = dataframe.drop('tkintertable-order', axis = 1, errors = 'ignore') # look at columns to see if all can be ints, in which case, set up figure # for depnumming if not was_series: if indices == 'guess': def isint(x): try: a = float(x) b = int(a) except ValueError or OverflowError: return False else: return a == b if all([isint(x) is True for x in list(dataframe.columns)]): indices = True else: indices = False # if depnumming, plot all, transpose, and rename axes if indices is True: num_to_plot = 'all' dataframe = dataframe.T if y_label is None: y_label = 'Percentage of all matches' if x_label is None: x_label = '' # set backend? output_formats = ['svgz', 'ps', 'emf', 'rgba', 'raw', 'pdf', 'svg', 'eps', 'png', 'pgf'] if output_format not in output_formats: raise ValueError('%s output format not recognised. 
Must be: %s' % (output_format, ', '.join(output_formats))) # don't know if these are necessary if 'pdf' in output_format: plt.switch_backend(output_format) if 'pgf' in output_format: plt.switch_backend(output_format) if num_to_plot == 'all': if was_series: if not piemode: num_to_plot = len(dataframe) else: num_to_plot = len(dataframe) else: if not piemode: num_to_plot = len(list(dataframe.columns)) else: num_to_plot = len(dataframe.index) # explode pie, or remove if not piemode if 'explode' in kwargs: if not piemode: del kwargs['explode'] if piemode: if 'explode' in kwargs: if not sbplt: kwargs['explode'] = auto_explode(dataframe, kwargs['explode'], was_series = was_series, num_to_plot = num_to_plot) if 'legend' in kwargs: legend = kwargs['legend'] else: legend = True #cut data short plotting_a_totals_column = False if was_series: if list(dataframe.columns)[0] != 'Total': try: can_be_ints = [int(x) for x in list(dataframe.index)] num_to_plot = len(dataframe) except: dataframe = dataframe[:num_to_plot] elif list(dataframe.columns)[0] == 'Total': plotting_a_totals_column = True if not 'legend' in kwargs: legend = False num_to_plot = len(dataframe) else: dataframe = dataframe.T.head(num_to_plot).T # remove stats fields, put p in entry text, etc. 
statfields = ['slope', 'intercept', 'r', 'p', 'stderr'] try: dataframe = dataframe.drop(statfields, axis = 1) except: pass try: dataframe.ix['p'] there_are_p_vals = True except: there_are_p_vals = False if show_p_val: if there_are_p_vals: newnames = [] for col in list(dataframe.columns): pval = dataframe[col]['p'] newname = '%s (p=%s)' % (col, format(pval, '.5f')) newnames.append(newname) dataframe.columns = newnames dataframe.drop(statfields, axis = 0, inplace = True) else: warnings.warn('No p-values calculated to show.\n\nUse sort_by and keep_stats in editor() to generate these values.') else: if there_are_p_vals: dataframe.drop(statfields, axis = 0, inplace = True) # make and set y label absolutes = True if type(dataframe) == pandas.core.frame.DataFrame: try: if not all([s.is_integer() for s in dataframe.iloc[0,:].values]): absolutes = False except: pass else: if not all([s.is_integer() for s in dataframe.values]): absolutes = False # use colormap if need be: if num_to_plot > 0: if not was_series: if 'kind' in kwargs: if kwargs['kind'] in ['pie', 'line', 'area']: if colours: if not plotting_a_totals_column: if colours == 'Default': colours = 'Paired' kwargs['colormap'] = colours #else: if colours: if colours == 'Default': colours = 'Paired' kwargs['colormap'] = colours if piemode: if num_to_plot > 0: if colours == 'Default': colours = 'Paired' kwargs['colormap'] = colours else: if num_to_plot > 0: if colours == 'Default': colours = 'Paired' kwargs['colormap'] = colours #else: #if len(dataframe.T.columns) < 8: #try: #del kwargs['colormap'] #except: #pass # multicoloured bar charts if 'kind' in kwargs: if colours: if kwargs['kind'].startswith('bar'): if len(list(dataframe.columns)) == 1: if not black_and_white: import numpy as np the_range = np.linspace(0, 1, num_to_plot) cmap = plt.get_cmap(colours) kwargs['colors'] = [cmap(n) for n in the_range] # make a bar width ... ? 
#kwargs['width'] = (figsize[0] / float(num_to_plot)) / 1.5 # reversing legend option if reverse_legend is True: rev_leg = True elif reverse_legend is False: rev_leg = False # show legend or don't, guess whether to reverse based on kind if 'kind' in kwargs: if kwargs['kind'] in ['bar', 'barh', 'area', 'line', 'pie']: if was_series: legend = False if kwargs['kind'] == 'pie': if pie_legend: legend = True else: legend = False if kwargs['kind'] in ['barh', 'area']: if reverse_legend == 'guess': rev_leg = True if not 'rev_leg' in locals(): rev_leg = False # the default legend placement if legend_pos is True: legend_pos = 'best' # cut dataframe if just_totals try: tst = dataframe['Combined total'] dataframe = dataframe.head(num_to_plot) except: pass # rotate automatically if 'rot' not in kwargs: if not was_series: xvals = [str(i) for i in list(dataframe.index)[:num_to_plot]] #if 'kind' in kwargs: #if kwargs['kind'] in ['barh', 'area']: #xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]] else: xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]] if len(max(xvals, key=len)) > 6: if not piemode: kwargs['rot'] = 45 # no title for subplots because ugly, if sbplt: if 'title' in kwargs: del kwargs['title'] else: kwargs['title'] = title # no interactive subplots yet: if sbplt and interactive: import warnings interactive = False warnings.warn('No interactive subplots yet, sorry.') return # not using pandas for labels or legend anymore. 
#kwargs['labels'] = None #kwargs['legend'] = False if legend: # kwarg options go in leg_options leg_options = {'framealpha': .8} if 'shadow' in kwargs: leg_options['shadow'] = True if 'ncol' in kwargs: leg_options['ncol'] = kwargs['ncol'] del kwargs['ncol'] else: if num_to_plot > 6: leg_options['ncol'] = num_to_plot / 7 # determine legend position based on this dict if legend_pos: possible = {'best': 0, 'upper right': 1, 'upper left': 2, 'lower left': 3, 'lower right': 4, 'right': 5, 'center left': 6, 'center right': 7, 'lower center': 8, 'upper center': 9, 'center': 10, 'o r': 2, 'outside right': 2, 'outside upper right': 2, 'outside center right': 'center left', 'outside lower right': 'lower left'} if type(legend_pos) == int: the_loc = legend_pos elif type(legend_pos) == str: try: the_loc = possible[legend_pos] except KeyError: raise KeyError('legend_pos value must be one of:\n%s\n or an int between 0-10.' %', '.join(possible.keys())) leg_options['loc'] = the_loc #weirdness needed for outside plot if legend_pos in ['o r', 'outside right', 'outside upper right']: leg_options['bbox_to_anchor'] = (1.02, 1) if legend_pos == 'outside center right': leg_options['bbox_to_anchor'] = (1.02, 0.5) if legend_pos == 'outside lower right': leg_options['loc'] == 'upper right' leg_options['bbox_to_anchor'] = (0.5, 0.5) # a bit of distance between legend and plot for outside legends if type(legend_pos) == str: if legend_pos.startswith('o'): leg_options['borderaxespad'] = 1 if not piemode: if show_totals.endswith('both') or show_totals.endswith('legend'): dataframe = rename_data_with_total(dataframe, was_series = was_series, using_tex = using_tex, absolutes = absolutes) else: if pie_legend: if show_totals.endswith('both') or show_totals.endswith('legend'): dataframe = rename_data_with_total(dataframe, was_series = was_series, using_tex = using_tex, absolutes = absolutes) if piemode: if partial_pie: dataframe = dataframe / 100.0 # some pie things if piemode: if not sbplt: 
kwargs['y'] = list(dataframe.columns)[0] if pie_legend: kwargs['legend'] = False if was_series: leg_options['labels'] = list(dataframe.index) else: leg_options['labels'] = list(dataframe.columns) else: if pie_legend: kwargs['legend'] = False if was_series: leg_options['labels'] = list(dataframe.index) else: leg_options['labels'] = list(dataframe.index) areamode = False if 'kind' in kwargs: if kwargs['kind'] == 'area': areamode = True if legend is False: kwargs['legend'] = False # cumulative grab first col if cumulative: kwargs['y'] = list(dataframe.columns)[0] # line highlighting option for interactive! if interactive: if 2 in interactive_types: if kwargs['kind'] == 'line': kwargs['marker'] = ',' if not piemode: kwargs['alpha'] = 0.1 # convert dates --- works only in my current case! if plotting_a_totals_column or not was_series: try: can_it_be_int = int(list(dataframe.index)[0]) can_be_int = True except: can_be_int = False if can_be_int: if 1500 < int(list(dataframe.index)[0]): if 2050 > int(list(dataframe.index)[0]): n = pd.PeriodIndex([d for d in list(dataframe.index)], freq='A') dataframe = dataframe.set_index(n) MARKERSIZE = 4 COLORMAP = { 0: {'marker': None, 'dash': (None,None)}, 1: {'marker': None, 'dash': [5,5]}, 2: {'marker': "o", 'dash': (None,None)}, 3: {'marker': None, 'dash': [1,3]}, 4: {'marker': "s", 'dash': [5,2,5,2,5,10]}, 5: {'marker': None, 'dash': [5,3,1,2,1,10]}, 6: {'marker': 'o', 'dash': (None,None)}, 7: {'marker': None, 'dash': [5,3,1,3]}, 8: {'marker': "1", 'dash': [1,3]}, 9: {'marker': "*", 'dash': [5,5]}, 10: {'marker': "2", 'dash': [5,2,5,2,5,10]}, 11: {'marker': "s", 'dash': (None,None)} } HATCHES = { 0: {'color': '#dfdfdf', 'hatch':"/"}, 1: {'color': '#6f6f6f', 'hatch':"\\"}, 2: {'color': 'b', 'hatch':"|"}, 3: {'color': '#dfdfdf', 'hatch':"-"}, 4: {'color': '#6f6f6f', 'hatch':"+"}, 5: {'color': 'b', 'hatch':"x"} } if black_and_white: if kwargs['kind'] == 'line': kwargs['linewidth'] = 1 cmap = plt.get_cmap('Greys') new_cmap = 
truncate_colormap(cmap, 0.25, 0.95) if kwargs['kind'] == 'bar': # darker if just one entry if len(dataframe.columns) == 1: new_cmap = truncate_colormap(cmap, 0.70, 0.90) kwargs['colormap'] = new_cmap # use styles and plot with plt.style.context((style)): if not sbplt: # check if negative values, no stacked if so if areamode: if dataframe.applymap(lambda x: x < 0.0).any().any(): kwargs['stacked'] = False rev_leg = False ax = dataframe.plot(figsize = figsize, **kwargs) else: if not piemode and not sbplt: ax = dataframe.plot(figsize = figsize, **kwargs) else: ax = dataframe.plot(figsize = figsize, **kwargs) handles, labels = plt.gca().get_legend_handles_labels() plt.legend( handles, labels, loc = leg_options['loc'], bbox_to_anchor = (0,-0.1,1,1), bbox_transform = plt.gcf().transFigure ) if not tk: plt.show() return if 'rot' in kwargs: if kwargs['rot'] != 0 and kwargs['rot'] != 90: labels = [item.get_text() for item in ax.get_xticklabels()] ax.set_xticklabels(labels, rotation = kwargs['rot'], ha='right') if transparent: plt.gcf().patch.set_facecolor('white') plt.gcf().patch.set_alpha(0) if black_and_white: #plt.grid() plt.gca().set_axis_bgcolor('w') if kwargs['kind'] == 'line': # white background # change everything to black and white with interesting dashes and markers c = 0 for line in ax.get_lines(): line.set_color('black') #line.set_width(1) line.set_dashes(COLORMAP[c]['dash']) line.set_marker(COLORMAP[c]['marker']) line.set_markersize(MARKERSIZE) c += 1 if c == len(COLORMAP.keys()): c = 0 if legend: if not piemode and not sbplt: if 3 not in interactive_types: if not rev_leg: lgd = plt.legend(**leg_options) else: handles, labels = plt.gca().get_legend_handles_labels() lgd = plt.legend(handles[::-1], labels[::-1], **leg_options) #if black_and_white: #lgd.set_facecolor('w') #if interactive: #if legend: #lgd.set_title("") #if not sbplt: #if 'layout' not in kwargs: #plt.tight_layout() if interactive: # 1 = highlight lines # 2 = line labels # 3 = legend switches ax = 
plt.gca() # fails for piemode lines = ax.lines handles, labels = plt.gca().get_legend_handles_labels() if 1 in interactive_types: plugins.connect(plt.gcf(), HighlightLines(lines)) if 3 in interactive_types: plugins.connect(plt.gcf(), InteractiveLegendPlugin(lines, labels, alpha_unsel=0.0)) for i, l in enumerate(lines): y_vals = l.get_ydata() x_vals = l.get_xdata() x_vals = [str(x) for x in x_vals] if absolutes: ls = ['%s (%s: %d)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)] else: ls = ['%s (%s: %.2f%%)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)] if 2 in interactive_types: #if 'kind' in kwargs and kwargs['kind'] == 'area': tooltip_line = mpld3.plugins.LineLabelTooltip(lines[i], labels[i]) mpld3.plugins.connect(plt.gcf(), tooltip_line) #else: if kwargs['kind'] == 'line': tooltip_point = mpld3.plugins.PointLabelTooltip(l, labels = ls) mpld3.plugins.connect(plt.gcf(), tooltip_point) # works: #plugins.connect(plt.gcf(), plugins.LineLabelTooltip(l, labels[i])) #labels = ["Point {0}".format(i) for i in range(num_to_plot)] #tooltip = plugins.LineLabelTooltip(lines) #mpld3.plugins.connect(plt.gcf(), mpld3.plugins.PointLabelTooltip(lines)) if piemode: if not sbplt: plt.axis('equal') ax.get_xaxis().set_visible(False) ax.get_yaxis().set_visible(False) # add x label # this could be revised now! 
# if time series period, it's year for now if type(dataframe.index) == pandas.tseries.period.PeriodIndex: x_label = 'Year' if x_label is not False: if type(x_label) == str: plt.xlabel(x_label) else: check_x_axis = list(dataframe.index)[0] # get first entry# get second entry of first entry (year, count) try: if type(dataframe.index) == pandas.tseries.period.PeriodIndex: x_label = 'Year' check_x_axis = int(check_x_axis) if 1500 < check_x_axis < 2050: x_label = 'Year' else: x_label = 'Group' except: x_label = 'Group' if not sbplt: if not piemode: plt.xlabel(x_label) # no offsets for numerical x and y values if type(dataframe.index) != pandas.tseries.period.PeriodIndex: try: # check if x axis can be an int check_x_axis = list(dataframe.index)[0] can_it_be_int = int(check_x_axis) # if so, set these things from matplotlib.ticker import ScalarFormatter plt.gca().xaxis.set_major_formatter(ScalarFormatter()) except: pass # same for y axis try: # check if x axis can be an int check_y_axis = list(dataframe.columns)[0] can_it_be_int = int(check_y_axis) # if so, set these things from matplotlib.ticker import ScalarFormatter plt.gca().yaxis.set_major_formatter(ScalarFormatter()) except: pass # y labelling y_l = False if not absolutes: y_l = 'Percentage' else: y_l = 'Absolute frequency' if y_label is not False: if not sbplt: if not piemode: if type(y_label) == str: plt.ylabel(y_label) else: plt.ylabel(y_l) # hacky: turn legend into subplot titles :) if sbplt: # title the big plot #plt.suptitle(title, fontsize = 16) # get all axes if 'layout' not in kwargs: axes = [l for index, l in enumerate(ax)] else: axes = [] cols = [l for index, l in enumerate(ax)] for col in cols: for bit in col: axes.append(bit) # set subplot titles for index, a in enumerate(axes): try: titletext = list(dataframe.columns)[index] except: pass a.set_title(titletext) try: a.legend_.remove() except: pass # remove axis labels for pie plots if piemode: a.axes.get_xaxis().set_visible(False) 
a.axes.get_yaxis().set_visible(False) a.axis('equal') # add sums to bar graphs and pie graphs # doubled right now, no matter if not sbplt: if 'kind' in kwargs: if kwargs['kind'].startswith('bar'): width = ax.containers[0][0].get_width() if was_series: the_y_limit = plt.ylim()[1] if show_totals.endswith('plot') or show_totals.endswith('both'): # make plot a bit higher if putting these totals on it plt.ylim([0,the_y_limit * 1.05]) for i, label in enumerate(list(dataframe.index)): if len(dataframe.ix[label]) == 1: score = dataframe.ix[label][0] else: if absolutes: score = dataframe.ix[label].sum() else: #import warnings #warnings.warn("It's not possible to determine total percentage from individual percentages.") continue if not absolutes: plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom') else: plt.annotate(score, (i, score), ha = 'center', va = 'bottom') else: the_y_limit = plt.ylim()[1] if show_totals.endswith('plot') or show_totals.endswith('both'): for i, label in enumerate(list(dataframe.columns)): if len(dataframe[label]) == 1: score = dataframe[label][0] else: if absolutes: score = dataframe[label].sum() else: #import warnings #warnings.warn("It's not possible to determine total percentage from individual percentages.") continue if not absolutes: plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom') else: plt.annotate(score, (i, score), ha = 'center', va = 'bottom') #if not running_python_tex: #plt.gcf().show() plt.subplots_adjust(left=0.1) plt.subplots_adjust(bottom=0.18) #if 'layout' not in kwargs: #plt.tight_layout() if save: import os if running_python_tex: imagefolder = '../images' else: imagefolder = 'images' savename = get_savename(imagefolder, save = save, title = title, ext = output_format) if not os.path.isdir(imagefolder): os.makedirs(imagefolder) # save image and get on with our lives if legend_pos.startswith('o'): plt.gcf().savefig(savename, dpi=150, bbox_extra_artists=(lgd,), bbox_inches='tight', format = 
output_format) else: plt.gcf().savefig(savename, dpi=150, format = output_format) time = strftime("%H:%M:%S", localtime()) if os.path.isfile(savename): print '\n' + time + ": " + savename + " created." else: raise ValueError("Error making %s." % savename) if not interactive and not running_python_tex and not running_spider and not tk: plt.show() return if running_spider or tk or sbplt: return plt if interactive: plt.subplots_adjust(right=.8) plt.subplots_adjust(left=.1) try: ax.legend_.remove() except: pass return mpld3.display()
# -*- coding:utf-8 -*-
# Demo script: build a small DataFrame that contains missing values and print
# its sums, row means and a few other summaries, each under a printed heading.
import numpy as np
from pandas import Series, DataFrame

row_labels = ['a', 'b', 'c', 'd']
col_labels = ['one', 'two']
cells = [
    [1.4, np.nan],
    [7.1, -4.5],
    [np.nan, np.nan],
    [0.75, -1.3],
]

# Heading: "sum".
print('求和')
df = DataFrame(cells, index=row_labels, columns=col_labels)
print(df)
print(df.sum())  # sum down each column
print(df.sum(axis=1))  # sum across each row

# Heading: "mean". Row means, first with skipna=False, then with the default.
print('平均数')
print(df.mean(axis=1, skipna=False))
print(df.mean(axis=1))

# Heading: "other". Remaining summaries, printed in the same order as before.
print('其它')
for summary in (df.idxmax, df.cumsum, df.describe):
    print(summary())

obj = Series(['a', 'a', 'b', 'c'] * 4)
print(obj.describe())
# Transcript of an interactive session exploring Series / DataFrame basics.
# NOTE(review): statement boundaries were reconstructed from one collapsed
# line — confirm. Several entries (`%run`, `a,b in`, the colon-less `for`
# lines) are not valid Python, so this cannot run as a script as-is.
# NOTE(review): Series, DataFrame and np are not imported in this block —
# presumably already present in the session namespace; verify.

# Series whose index labels repeat ('a' and 'b' each appear twice).
obj = Series(range(5),index=['a','a','b','b','c'])
obj
obj.index.is_unique
# Look up a repeated label and a non-repeated label.
obj['a']
obj['c']
# 4x3 frame of random values, again with repeated row labels.
df = DataFrame(np.random.randn(4,3),index=['a','a','b','b'])
df
# NOTE(review): `.ix` is deprecated/removed in newer pandas — confirm the
# pandas version this was recorded against.
df.ix['b']
# Frame containing NaNs, used for the summary calls below.
df = DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],index=['a','b','c','d'],columns=['one','two'])
df
df.sum()
df.sum(axis=1)
df.mean(axis=1,skipna=False)
df.mean(axis=1)
df.idxmax()
df.cumsum()
df.cumsum(axis=1)
df.describe()
# Series of 16 strings built by repeating a 4-item list.
obj = Series(['a','a','b','c'] * 4)
obj.describe()
obj
# IPython magic command, not Python syntax.
%run Dataframe.py
from pandas_datareader import data
# Fetch data for four tickers, keyed by ticker symbol.
# NOTE(review): confirm `get_data_google` is still provided by the installed
# pandas_datareader.
all_data = {}
for ticker in ['AAPL','IBM','MSFT','GOOG']:
    all_data[ticker] = data.get_data_google(ticker,'1/1/2000','1/1/2010')
# NOTE(review): `iteriterms` looks like a typo for `iteritems`; the next entry
# repeats the same statement with `iteritems`.
price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.iteriterms()})
# NOTE(review): dict.iteritems exists only on Python 2 dicts — confirm the
# interpreter version.
price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.iteritems()})
all_data.iteritems()
# The remaining entries are incomplete fragments as recorded (no colon or
# loop body); kept verbatim.
a,b in
for a, b in all_data.iteritems()
a,b
for a, b in all_data.iteritems()