def predictionRatio(df, metric="Levenshtein"):
    """Return the fraction of rows whose best fuzzy match is the correct one.

    ``df`` must have exactly two columns of strings.  For every value of the
    second column, the most similar value of the first column is located
    (case-insensitively).  The row counts as a correct match when the
    second-column value stored at that best position equals the value being
    searched for.

    Parameters
    ----------
    df : DataFrame
        Two string columns.  The frame is NOT modified (earlier versions
        lower-cased both columns in place).
    metric : str
        ``'Levenshtein'`` scores with ``Levenshtein.distance`` and picks the
        minimum; any other value scores with ``Levenshtein.jaro_winkler``
        and picks the maximum.

    Returns
    -------
    float
        Share of correct matches in ``[0, 1]``; ``0.0`` for an empty frame.
    """
    soc_media_1, soc_media_2 = df.columns

    # Work on lower-cased copies so the caller's frame is left untouched.
    known = df[soc_media_1].str.lower().tolist()
    search = df[soc_media_2].str.lower().tolist()
    if not search:
        # Nothing to match: the ratio is undefined, report "no hits".
        return 0.0

    # Levenshtein is a distance (smaller is better); Jaro-Winkler is a
    # similarity (larger is better).
    if metric == 'Levenshtein':
        score, pick_best = Levenshtein.distance, min
    else:
        score, pick_best = Levenshtein.jaro_winkler, max

    positions = range(len(known))
    hits = 0
    for actual in search:
        scores = [score(candidate, actual) for candidate in known]
        # min/max return the first position on ties, like idxmin/idxmax.
        best = pick_best(positions, key=scores.__getitem__)
        # Positional lookup, so duplicate or unsorted index labels are harmless.
        if search[best] == actual:
            hits += 1

    return float(hits) / len(search)
def test_applymap(self):
    """Check DataFrame.applymap for values, tuples, dtypes and duplicate columns."""
    doubled = self.frame.applymap(lambda v: v * 2)
    assert_frame_equal(doubled, self.frame * 2)

    # Smoke check only: mapping every cell to its type must not raise.
    self.frame.applymap(type)

    # GH #465, function returning tuples
    paired = self.frame.applymap(lambda v: (v, v))
    tm.assertIsInstance(paired['A'][0], tuple)

    # GH 2909, object conversion to float in constructor?
    for leading in (1, 1.):
        mixed = DataFrame(data=[leading, 'a'])
        unchanged = mixed.applymap(lambda v: v)
        self.assertEqual(unchanged.dtypes[0], object)

    # GH2786: duplicate column labels
    dup_cols = ['a', 'a', 'a', 'a']
    df = DataFrame(np.random.random((3, 4)))
    unique_copy = df.copy()
    df.columns = dup_cols
    expected = unique_copy.applymap(str)
    expected.columns = dup_cols
    assert_frame_equal(df.applymap(str), expected)

    # datetime/timedelta
    df['datetime'] = Timestamp('20130101')
    df['timedelta'] = pd.Timedelta('1 min')
    as_text = df.applymap(str)
    for column in ['datetime', 'timedelta']:
        self.assertEqual(as_text.loc[0, column], str(df.loc[0, column]))
def applyDataFrame():
    """Print a small demo of DataFrame.apply (per axis) and DataFrame.applymap.

    Builds a 4x3 integer frame, prints it, prints the max-min spread of each
    column and of each row, then prints the frame with every cell formatted
    to two decimals.  Returns None.
    """
    df = DataFrame(np.arange(12).reshape(4, 3),
                   columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
    print(df)

    def spread(x):
        # Range of a row/column: largest minus smallest value.
        return x.max() - x.min()

    def two_decimals(x):
        return '%.2f' % x

    per_column = df.apply(spread, axis=0)
    per_row = df.apply(spread, axis=1)
    print(per_column)
    print(per_row)

    # Element-wise formatting; computed once (the original computed it twice
    # and discarded the first result).
    print(df.applymap(two_decimals))
def pd_03():
    """Print a demo of element-wise, per-column and per-row operations.

    Shows ``np.abs`` on a frame, ``DataFrame.apply`` with a scalar-returning
    and a Series-returning function, and ``DataFrame.applymap`` for per-cell
    formatting.  Returns None.
    """
    frame = DataFrame(np.random.randn(4, 3),
                      columns=list('bde'),
                      index=['Ohio', 'Colorado', 'Utah', 'New York'])
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(frame)
    print(np.abs(frame))

    def spread(x):
        # Range of a row/column: largest minus smallest value.
        return x.max() - x.min()

    print(frame.apply(spread))
    print(frame.apply(spread, axis=1))

    def min_max(x):
        # Returning a Series makes apply() produce one output row per label.
        return Series([x.min(), x.max()], index=['min', 'max'])

    print(frame.apply(min_max))

    def two_decimals(x):
        # Named so it does not shadow the builtin ``format``.
        return '%.2f' % x

    print(frame.applymap(two_decimals))
def test_to_csv_date_format(self):
    """Check that to_csv(date_format=...) formats values, index and columns."""

    def compact(stamp):
        return stamp.strftime('%Y%m%d')

    def compact_int(stamp):
        return int(stamp.strftime('%Y%m%d'))

    def dashed(stamp):
        return stamp.strftime('%Y-%m-%d')

    with ensure_clean('__tmp_to_csv_date_format__') as path:
        stamps = self.tsframe.index
        datetime_frame = DataFrame({'A': stamps, 'B': stamps.shift(1)},
                                   index=stamps)

        # Compact format: read_csv parses the written values back as ints.
        datetime_frame.to_csv(path, date_format='%Y%m%d')
        test = read_csv(path, index_col=0)

        expected_int = datetime_frame.applymap(compact_int)
        expected_int.index = expected_int.index.map(compact_int)
        assert_frame_equal(test, expected_int)

        # Dashed format: values come back as strings.
        datetime_frame.to_csv(path, date_format='%Y-%m-%d')
        test = read_csv(path, index_col=0)

        expected_str = datetime_frame.applymap(dashed)
        expected_str.index = expected_str.index.map(dashed)
        assert_frame_equal(test, expected_str)

        # Check that columns get converted
        transposed = datetime_frame.T
        transposed.to_csv(path, date_format='%Y%m%d')
        test = read_csv(path, index_col=0)

        transposed = transposed.applymap(compact_int)
        # Columns don't get converted to ints by read_csv
        transposed.columns = transposed.columns.map(compact)
        assert_frame_equal(test, transposed)

        # test NaTs
        nat_index = to_datetime(
            ['NaT'] * 10 + ['2000-01-01', '1/1/2000', '1-1-2000'])
        nat_frame = DataFrame({'A': nat_index}, index=nat_index)
        nat_frame.to_csv(path, date_format='%Y-%m-%d')

        test = read_csv(path, parse_dates=[0, 1], index_col=0)
        assert_frame_equal(test, nat_frame)
def test_applymap(self):
    """Check DataFrame.applymap, including duplicate columns and empty frames."""
    doubled = self.frame.applymap(lambda v: v * 2)
    assert_frame_equal(doubled, self.frame * 2)

    # Smoke check only: mapping every cell to its type must not raise.
    self.frame.applymap(type)

    # GH #465, function returning tuples
    paired = self.frame.applymap(lambda v: (v, v))
    tm.assertIsInstance(paired["A"][0], tuple)

    # GH 2909, object conversion to float in constructor?
    for leading in (1, 1.0):
        mixed = DataFrame(data=[leading, "a"])
        unchanged = mixed.applymap(lambda v: v)
        self.assertEqual(unchanged.dtypes[0], object)

    # GH2786: duplicate column labels
    dup_cols = ["a", "a", "a", "a"]
    df = DataFrame(np.random.random((3, 4)))
    unique_copy = df.copy()
    df.columns = dup_cols
    expected = unique_copy.applymap(str)
    expected.columns = dup_cols
    assert_frame_equal(df.applymap(str), expected)

    # datetime/timedelta
    df["datetime"] = Timestamp("20130101")
    df["timedelta"] = pd.Timedelta("1 min")
    as_text = df.applymap(str)
    for column in ["datetime", "timedelta"]:
        self.assertEqual(as_text.loc[0, column], str(df.loc[0, column]))

    # GH 8222: applymap on empty frames returns an equal frame
    funcs = [round, lambda v: v]
    empty_frames = [
        pd.DataFrame(),
        pd.DataFrame(columns=list("ABC")),
        pd.DataFrame(index=list("ABC")),
        pd.DataFrame({"A": [], "B": [], "C": []}),
    ]
    for empty in empty_frames:
        for func in funcs:
            tm.assert_frame_equal(empty.applymap(func), empty)
def test_frame_apply_dont_convert_datetime64(self): from pandas.tseries.offsets import BDay df = DataFrame({'x1': [datetime(1996, 1, 1)]}) df = df.applymap(lambda x: x + BDay()) df = df.applymap(lambda x: x + BDay()) assert df.x1.dtype == 'M8[ns]'
def test_applymap(self, float_frame):
    """Check DataFrame.applymap, including duplicate columns and empty frames."""
    doubled = float_frame.applymap(lambda v: v * 2)
    tm.assert_frame_equal(doubled, float_frame * 2)

    # Smoke check only: mapping every cell to its type must not raise.
    float_frame.applymap(type)

    # GH 465: function returning tuples
    paired = float_frame.applymap(lambda v: (v, v))
    assert isinstance(paired['A'][0], tuple)

    # GH 2909: object conversion to float in constructor?
    for leading in (1, 1.):
        mixed = DataFrame(data=[leading, 'a'])
        unchanged = mixed.applymap(lambda v: v)
        assert unchanged.dtypes[0] == object

    # GH 2786: duplicate column labels
    dup_cols = ['a', 'a', 'a', 'a']
    df = DataFrame(np.random.random((3, 4)))
    unique_copy = df.copy()
    df.columns = dup_cols
    expected = unique_copy.applymap(str)
    expected.columns = dup_cols
    tm.assert_frame_equal(df.applymap(str), expected)

    # datetime/timedelta
    df['datetime'] = Timestamp('20130101')
    df['timedelta'] = pd.Timedelta('1 min')
    as_text = df.applymap(str)
    for column in ['datetime', 'timedelta']:
        assert as_text.loc[0, column] == str(df.loc[0, column])

    # GH 8222: applymap on empty frames returns an equal frame
    funcs = [round, lambda v: v]
    empty_frames = [
        pd.DataFrame(),
        pd.DataFrame(columns=list('ABC')),
        pd.DataFrame(index=list('ABC')),
        pd.DataFrame({'A': [], 'B': [], 'C': []}),
    ]
    for empty in empty_frames:
        for func in funcs:
            tm.assert_frame_equal(empty.applymap(func), empty)
def setup(self, sep, thousands): N = 10000 K = 8 data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K)) df = DataFrame(data) if thousands is not None: fmt = ':{}'.format(thousands) fmt = '{' + fmt + '}' df = df.applymap(lambda x: fmt.format(x)) df.to_csv(self.fname, sep=sep)
def test():
    """Print a demo contrasting apply, applymap and Series.map; return the frame.

    Prints, in order: the per-column max-min spread, the frame with every
    cell formatted to two decimals, and the per-column spread formatted to
    two decimals.  An empty line is printed between the outputs.

    Returns
    -------
    DataFrame
        The randomly generated 4x3 frame the demo was run on.
    """
    frame = DataFrame(numpy.random.randn(4, 3),
                      columns=list('bde'),
                      index=['Utah', 'Ohio', 'Texas', 'Oregon'])

    # Named helpers instead of lambdas called ``format`` / ``range``, which
    # shadowed the builtins inside this function.
    def two_decimals(x):
        return '%.2f' % x

    def spread(x):
        return x.max() - x.min()

    # http://stackoverflow.com/questions/19798153/difference-between-map-applymap-and-apply-methods-in-pandas/19798528#19798528
    print(frame.apply(spread))                    # apply: one call per column
    print("")
    print(frame.applymap(two_decimals))           # applymap: one call per cell
    print("")
    print(frame.apply(spread).map(two_decimals))  # map: one call per Series value
    return frame
def compare_panel_lengths(self, panels, reference_label="GAL_Completo"):
    """Tabulate each panel's AIM count and its ratio to a reference panel.

    Parameters
    ----------
    panels : iterable
        Objects exposing ``label`` and ``rs_ids`` (``len(rs_ids)`` is used
        as the count).
    reference_label : str
        Label handed to ``Panel(...)``; the length of that panel's
        ``rs_ids`` is the denominator of the ratio.

    Returns
    -------
    DataFrame
        Indexed by panel label (index name "Panel", sorted descending) with
        columns "AIMs count" (formatted by ``thousands_separator``) and
        "Ratio" (rounded to two decimals).  An empty ``panels`` yields an
        empty frame with both columns.
    """
    ref_count = len(Panel(reference_label).rs_ids)

    # Build the frame in one shot: growing it with DataFrame.append is
    # quadratic and that method is not available in newer pandas.
    panels = list(panels)
    comparison = DataFrame(
        {"AIMs count": [len(panel.rs_ids) for panel in panels]},
        index=[panel.label for panel in panels],
    )
    comparison = comparison.sort_index(ascending=False)

    # Compute the ratio while the counts are still numeric.
    comparison["Ratio"] = comparison["AIMs count"] / ref_count
    comparison["Ratio"] = comparison["Ratio"].map(lambda x: round(x, 2))
    comparison["AIMs count"] = comparison["AIMs count"].map(thousands_separator)
    comparison.index.name = "Panel"

    return comparison
def free_energy(dist: pd.DataFrame, kbt: float) -> pd.DataFrame:
    """
    Compute the free energy from a probability distribution.

    Each cell ``p`` becomes ``-kbt * log(p)``; cells that are exactly zero
    become ``+inf`` regardless of the sign of ``kbt``.

    Parameters
    ----------
    dist : Probability distribution.
    kbt : k_b * T as calculated by plumed.

    Returns
    -------
    free_energy : Free energy of probability distribution as a Dataframe
        with the same index and columns as ``dist``.
    """
    probabilities = dist.astype(float)

    # log(0) would emit a divide-by-zero warning; those cells are
    # overwritten below, so the warning is silenced here.
    with np.errstate(divide='ignore'):
        energy = -kbt * np.log(probabilities)

    # Explicit +inf rather than relying on -kbt * -inf, which would give
    # -inf for negative kbt and NaN for kbt == 0.
    return energy.mask(probabilities == 0, float('inf'))
def normalize(df, rd_ref):
    """Rescale every column so its average equals ``rd_ref``.

    Each column is multiplied by ``rd_ref / average(column)``.  Cells that
    are exactly zero after scaling are replaced by ``0.0001``.
    NOTE(review): the replacement presumably keeps a later log transform
    finite; confirm with the caller.

    Parameters
    ----------
    df : DataFrame
        One column per sample.
    rd_ref : number
        Target average for every column.

    Returns
    -------
    DataFrame
        New frame with the same index values and columns; ``df`` itself is
        not modified.
    """
    # sample_name -> scaled values
    scaled = {}
    for sample_name in df.columns:
        column = df[sample_name]
        scaled[sample_name] = column * (rd_ref / np.average(column))

    new_df = DataFrame(scaled, index=df.index.values)

    # The original compared the undefined/unrelated name ``log`` with 0
    # instead of the cell value, so zeros were never replaced.
    return new_df.mask(new_df == 0, 0.0001)
def gs(str, list):
    """Load selected CSV columns and show a few exploratory plots.

    Parameters
    ----------
    str : path or buffer
        Passed to ``pd.read_csv``.  (The name shadows the builtin but is
        kept because it is part of the public signature.)
    list : list of column names
        Columns to load (``usecols``).  The first two are used for the
        scatter plot.  (Name kept for the same reason.)

    Steps: scatter plot of the first two columns, histograms and a box plot
    (all best-effort), then a printed table telling which cells are real
    numbers.  If the column named by the concatenation of ``list`` contains
    any non-real cell, a bar chart of that column's value counts is shown.
    """
    s = list
    w = pd.read_csv(str, usecols=s)

    # Plots are best-effort: a failure (e.g. non-numeric data or fewer than
    # two columns) must not stop the remaining steps.  ``Exception`` rather
    # than a bare except, so Ctrl-C and SystemExit still propagate.
    try:
        plt.scatter(w[s[0]], w[s[1]], color='red')
        plt.show()
    except Exception:
        pass

    try:
        w.hist()
        plt.show()
        w.plot(kind='box', by=s)
        plt.show()
    except Exception:
        pass

    # Boolean frame: True where the cell is a real number.
    t = w.applymap(np.isreal)
    print(t)

    # NOTE(review): joining all names only yields an existing column when a
    # single name was passed; otherwise this lookup raises KeyError, exactly
    # as before -- confirm intent.
    b = ''.join(s)
    if not t[b].all():
        w[b].value_counts().plot(kind='bar')
        plt.show()
# -*- coding: utf-8 -*- """""""""""""""""""""""""""""""""""""""""""""""""""""""""" 函数映射 """""""""""""""""""""""""""""""""""""""""""""""""""""""""" #%% import pandas as pd from pandas import Series,DataFrame from string import letters d=DataFrame({'a':['a','b'] * 5,'b':['pandas','Series','DataFrame','string','import'] * 2}) #%% 使用序列的map映射一个处理函数 d['b'].map(str.upper) #%% 如果对应的函数需要额外参数 d['b'].map(lambda x:x.replace('a','*')) #%% 通过字典做值替换 d['a'].map({'a':1,'b':2}) #%% #如果给值不全会变成空 d['a'].map({'a':1}) #%% 对数据框可以用applymap d.applymap(len) #%%
# Two Series of different length; 'one' has no entry for index 'd'.
data = {
    'one': Series([1, 2, 3], index=['a', 'b', 'c']),
    'two': Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
}

# Disabled demo: column-wise mean via DataFrame.apply.
# print(...) with one argument works on both Python 2 and Python 3, so the
# module stays importable even though the branch never runs.
if False:
    df = DataFrame(data)
    print(df)
    print(df.apply(np.mean))

# Lambda
# Disabled demo: element-wise predicates via Series.map / DataFrame.applymap.
if False:
    df = DataFrame(data)
    print(df)
    print(df['one'].map(lambda x: x > 1))
    print(df.applymap(lambda x: x > 1))

''' You might find using "boolean indexing" helpful for this problem. Here is a link to the pandas documentation. Here's also an excellent series of tutorials as IPython notebooks '''


def avg_bronze_medal_count():
    """Define the list of participating countries.

    Only the country list is visible in this excerpt; as written the
    function computes nothing further and returns None.
    """
    countries = [
        'Russian Fed.', 'Norway', 'Canada', 'United States', 'Netherlands',
        'Germany', 'Switzerland', 'Belarus', 'Austria', 'France', 'Poland',
        'China', 'Korea', 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
        'Finland', 'Great Britain', 'Ukraine', 'Slovakia', 'Italy', 'Latvia',
        'Australia', 'Croatia', 'Kazakhstan'
    ]
def plotter(df, title=False, kind='line', x_label=None, y_label=None, style='ggplot', figsize=(8, 4), save=False, legend_pos='best', reverse_legend='guess', num_to_plot=7, tex='try', colours='Accent', cumulative=False, pie_legend=True, partial_pie=False, show_totals=False, transparent=False, output_format='png', interactive=False, black_and_white=False, show_p_val=False, indices=False, transpose=False, **kwargs): """Visualise corpus interrogations. :param title: A title for the plot :type title: str :param df: Data to be plotted :type df: pandas.core.frame.DataFrame :param x_label: A label for the x axis :type x_label: str :param y_label: A label for the y axis :type y_label: str :param kind: The kind of chart to make :type kind: str ('line'/'bar'/'barh'/'pie'/'area') :param style: Visual theme of plot :type style: str ('ggplot'/'bmh'/'fivethirtyeight'/'seaborn-talk'/etc) :param figsize: Size of plot :type figsize: tuple (int, int) :param save: If bool, save with *title* as name; if str, use str as name :type save: bool/str :param legend_pos: Where to place legend :type legend_pos: str ('upper right'/'outside right'/etc) :param reverse_legend: Reverse the order of the legend :type reverse_legend: bool :param num_to_plot: How many columns to plot :type num_to_plot: int/'all' :param tex: Use TeX to draw plot text :type tex: bool :param colours: Colourmap for lines/bars/slices :type colours: str :param cumulative: Plot values cumulatively :type cumulative: bool :param pie_legend: Show a legend for pie chart :type pie_legend: bool :param partial_pie: Allow plotting of pie slices only :type partial_pie: bool :param show_totals: Print sums in plot where possible :type show_totals: str -- 'legend'/'plot'/'both' :param transparent: Transparent .png background :type transparent: bool :param output_format: File format for saved image :type output_format: str -- 'png'/'pdf' :param black_and_white: Create black and white line styles :type black_and_white: bool :param 
show_p_val: Attempt to print p values in legend if contained in df :type show_p_val: bool :param indices: To use when plotting "distance from root" :type indices: bool :param stacked: When making bar chart, stack bars on top of one another :type stacked: str :param filled: For area and bar charts, make every column sum to 100 :type filled: str :param legend: Show a legend :type legend: bool :param rot: Rotate x axis ticks by *rot* degrees :type rot: int :param subplots: Plot each column separately :type subplots: bool :param layout: Grid shape to use when *subplots* is True :type layout: tuple -- (int, int) :param interactive: Experimental interactive options :type interactive: list -- [1, 2, 3] :returns: matplotlib figure """ import corpkit import os try: from IPython.utils.shimmodule import ShimWarning import warnings warnings.simplefilter('ignore', ShimWarning) except: pass import matplotlib as mpl from matplotlib import rc # prefer seaborn plotting try: import seaborn as sns except: pass if interactive: import matplotlib.pyplot as plt, mpld3 else: import matplotlib.pyplot as plt import pandas from pandas import DataFrame import numpy from time import localtime, strftime from tests import check_pytex, check_spider, check_t_kinter if interactive: import mpld3 import collections from mpld3 import plugins, utils from plugins import InteractiveLegendPlugin, HighlightLines # check what environment we're in tk = check_t_kinter() running_python_tex = check_pytex() running_spider = check_spider() if not title: title = '' def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100): """remove extreme values from colourmap --- no pure white""" import matplotlib.colors as colors import numpy as np new_cmap = colors.LinearSegmentedColormap.from_list( 'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval), cmap(np.linspace(minval, maxval, n))) return new_cmap def get_savename(imagefolder, save=False, title=False, ext='png'): """Come up with the savename for the 
image.""" import os def urlify(s): "Turn title into filename" import re s = s.lower() s = re.sub(r"[^\w\s-]", '', s) s = re.sub(r"\s+", '-', s) s = re.sub(r"-(textbf|emph|textsc|textit)", '-', s) return s # name as if not ext.startswith('.'): ext = '.' + ext if type(save) == str: savename = os.path.join(imagefolder, (urlify(save) + ext)) #this 'else' is redundant now that title is obligatory else: if title: filename = urlify(title) + ext savename = os.path.join(imagefolder, filename) # remove duplicated ext if savename.endswith('%s%s' % (ext, ext)): savename = savename.replace('%s%s' % (ext, ext), ext, 1) return savename def rename_data_with_total(dataframe, was_series=False, using_tex=False, absolutes=True): """adds totals (abs, rel, keyness) to entry name strings""" if was_series: where_the_words_are = dataframe.index else: where_the_words_are = dataframe.columns the_labs = [] for w in list(where_the_words_are): if not absolutes: if was_series: perc = dataframe.T[w][0] else: the_labs.append(w) continue if using_tex: the_labs.append('%s (%.2f\%%)' % (w, perc)) else: the_labs.append('%s (%.2f %%)' % (w, perc)) else: if was_series: score = dataframe.T[w].sum() else: score = dataframe[w].sum() if using_tex: the_labs.append('%s (n=%d)' % (w, score)) else: the_labs.append('%s (n=%d)' % (w, score)) if not was_series: dataframe.columns = the_labs else: vals = list(dataframe[list(dataframe.columns)[0]].values) dataframe = pandas.DataFrame(vals, index=the_labs) dataframe.columns = ['Total'] return dataframe def auto_explode(dataframe, input, was_series=False, num_to_plot=7): """give me a list of strings and i'll output explode option""" output = [0 for s in range(num_to_plot)] if was_series: l = list(dataframe.index) else: l = list(dataframe.columns) if type(input) == str or type(input) == int: input = [input] if type(input) == list: for i in input: if type(i) == str: index = l.index(i) else: index = i output[index] = 0.1 return output # check if we're doing subplots sbplt 
= False if 'subplots' in kwargs: if kwargs['subplots'] is True: sbplt = True kwargs['subplots'] = sbplt if colours is True: colours = 'Paired' # todo: get this dynamically instead. styles = [ 'dark_background', 'bmh', 'grayscale', 'ggplot', 'fivethirtyeight', 'matplotlib', False, 'mpl-white' ] #if style not in styles: #raise ValueError('Style %s not found. Use %s' % (str(style), ', '.join(styles))) if style == 'mpl-white': try: sns.set_style("whitegrid") except: pass style = 'matplotlib' if style is not False and style.startswith('seaborn'): colours = False # use 'draggable = True' to make a draggable legend dragmode = kwargs.pop('draggable', False) if kwargs.get('savepath'): mpl.rcParams['savefig.directory'] = kwargs.get('savepath') kwargs.pop('savepath', None) mpl.rcParams['savefig.bbox'] = 'tight' mpl.rcParams.update({'figure.autolayout': True}) # try to use tex # TO DO: # make some font kwargs here using_tex = False mpl.rcParams['font.family'] = 'sans-serif' mpl.rcParams['text.latex.unicode'] = True if tex == 'try' or tex is True: try: rc('text', usetex=True) rc('font', **{'family': 'serif', 'serif': ['Computer Modern']}) using_tex = True except: matplotlib.rc('font', family='sans-serif') matplotlib.rc('font', serif='Helvetica Neue') matplotlib.rc('text', usetex='false') rc('text', usetex=False) else: rc('text', usetex=False) if interactive: using_tex = False if show_totals is False: show_totals = 'none' # find out what kind of plot we're making, and enable # or disable interactive values if need be kwargs['kind'] = kind.lower() if interactive: if kwargs['kind'].startswith('bar'): interactive_types = [3] elif kwargs['kind'] == 'area': interactive_types = [2, 3] elif kwargs['kind'] == 'line': interactive_types = [2, 3] elif kwargs['kind'] == 'pie': interactive_types = None warnings.warn( 'Interactive plotting not yet available for pie plots.') else: interactive_types = [None] if interactive is False: interactive_types = [None] # find out if pie mode, add autopct 
format piemode = False if kind == 'pie': piemode = True # always the best spot for pie #if legend_pos == 'best': #legend_pos = 'lower left' if show_totals.endswith('plot') or show_totals.endswith('both'): kwargs['pctdistance'] = 0.6 if using_tex: kwargs['autopct'] = r'%1.1f\%%' else: kwargs['autopct'] = '%1.1f%%' # copy data, make series into df dataframe = df.copy() was_series = False if type(dataframe) == pandas.core.series.Series: was_series = True if not cumulative: dataframe = DataFrame(dataframe) else: dataframe = DataFrame(dataframe.cumsum()) else: # don't know if this is much good. if transpose: dataframe = dataframe.T if cumulative: dataframe = DataFrame(dataframe.cumsum()) if len(list(dataframe.columns)) == 1: was_series = True # attempt to convert x axis to ints: try: dataframe.index = [int(i) for i in list(dataframe.index)] except: pass # remove totals and tkinter order if not was_series and not all(x.lower() == 'total' for x in list(dataframe.columns)): for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]): try: dataframe = dataframe.drop(name, axis=ax, errors='ignore') except: pass else: dataframe = dataframe.drop('tkintertable-order', errors='ignore') dataframe = dataframe.drop('tkintertable-order', axis=1, errors='ignore') # look at columns to see if all can be ints, in which case, set up figure # for depnumming if not was_series: if indices == 'guess': def isint(x): try: a = float(x) b = int(a) except ValueError or OverflowError: return False else: return a == b if all([isint(x) is True for x in list(dataframe.columns)]): indices = True else: indices = False # if depnumming, plot all, transpose, and rename axes if indices is True: num_to_plot = 'all' dataframe = dataframe.T if y_label is None: y_label = 'Percentage of all matches' if x_label is None: x_label = '' # set backend? 
output_formats = [ 'svgz', 'ps', 'emf', 'rgba', 'raw', 'pdf', 'svg', 'eps', 'png', 'pgf' ] if output_format not in output_formats: raise ValueError('%s output format not recognised. Must be: %s' % (output_format, ', '.join(output_formats))) # don't know if these are necessary if 'pdf' in output_format: plt.switch_backend(output_format) if 'pgf' in output_format: plt.switch_backend(output_format) if num_to_plot == 'all': if was_series: if not piemode: num_to_plot = len(dataframe) else: num_to_plot = len(dataframe) else: if not piemode: num_to_plot = len(list(dataframe.columns)) else: num_to_plot = len(dataframe.index) # explode pie, or remove if not piemode if piemode and not sbplt and kwargs.get('explode'): kwargs['explode'] = auto_explode(dataframe, kwargs['explode'], was_series=was_series, num_to_plot=num_to_plot) else: kwargs.pop('explode', None) legend = kwargs.get('legend', True) #cut data short plotting_a_totals_column = False if was_series: if list(dataframe.columns)[0] != 'Total': try: can_be_ints = [int(x) for x in list(dataframe.index)] num_to_plot = len(dataframe) except: dataframe = dataframe[:num_to_plot] elif list(dataframe.columns)[0] == 'Total': plotting_a_totals_column = True if not 'legend' in kwargs: legend = False num_to_plot = len(dataframe) else: dataframe = dataframe.T.head(num_to_plot).T # remove stats fields, put p in entry text, etc. 
statfields = ['slope', 'intercept', 'r', 'p', 'stderr'] try: dataframe = dataframe.drop(statfields, axis=1, errors='ignore') except: pass try: dataframe.ix['p'] there_are_p_vals = True except: there_are_p_vals = False if show_p_val: if there_are_p_vals: newnames = [] for col in list(dataframe.columns): pval = dataframe[col]['p'] def p_string_formatter(val): if val < 0.001: if not using_tex: return 'p < 0.001' else: return r'p $<$ 0.001' else: return 'p = %s' % format(val, '.3f') pstr = p_string_formatter(pval) newname = '%s (%s)' % (col, pstr) newnames.append(newname) dataframe.columns = newnames dataframe.drop(statfields, axis=0, inplace=True, errors='ignore') else: warnings.warn( 'No p-values calculated to show.\n\nUse sort_by and keep_stats in editor() to generate these values.' ) else: if there_are_p_vals: dataframe.drop(statfields, axis=0, inplace=True, errors='ignore') # make and set y label absolutes = True if type(dataframe) == pandas.core.frame.DataFrame: try: if not all([s.is_integer() for s in dataframe.iloc[0, :].values]): absolutes = False except: pass else: if not all([s.is_integer() for s in dataframe.values]): absolutes = False # use colormap if need be: if num_to_plot > 0: if not was_series: if kind in ['pie', 'line', 'area']: if colours: if not plotting_a_totals_column: if colours == 'Default': colours = 'Paired' kwargs['colormap'] = colours #else: if colours: if colours == 'Default': colours = 'Paired' kwargs['colormap'] = colours if piemode: if num_to_plot > 0: if colours == 'Default': colours = 'Paired' kwargs['colormap'] = colours else: if num_to_plot > 0: if colours == 'Default': colours = 'Paired' kwargs['colormap'] = colours # multicoloured bar charts if colours: if kind.startswith('bar'): if len(list(dataframe.columns)) == 1: if not black_and_white: import numpy as np the_range = np.linspace(0, 1, num_to_plot) middle = len(the_range) / 2 cmap = plt.get_cmap(colours) kwargs['color'] = [cmap(n) for n in the_range][middle] # make a bar width 
... ? ... #kwargs['width'] = (figsize[0] / float(num_to_plot)) / 1.5 # reversing legend option if reverse_legend is True: rev_leg = True elif reverse_legend is False: rev_leg = False # show legend or don't, guess whether to reverse based on kind if kind in ['bar', 'barh', 'area', 'line', 'pie']: if was_series: legend = False if kind == 'pie': if pie_legend: legend = True else: legend = False if kind in ['barh', 'area']: if reverse_legend == 'guess': rev_leg = True if not 'rev_leg' in locals(): rev_leg = False # the default legend placement if legend_pos is True: legend_pos = 'best' # cut dataframe if just_totals try: tst = dataframe['Combined total'] dataframe = dataframe.head(num_to_plot) except: pass # rotate automatically if 'rot' not in kwargs: if not was_series: xvals = [str(i) for i in list(dataframe.index)[:num_to_plot]] #if 'kind' in kwargs: #if kwargs['kind'] in ['barh', 'area']: #xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]] else: xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]] if len(max(xvals, key=len)) > 6: if not piemode: kwargs['rot'] = 45 else: kwargs['rot'] = False # no title for subplots because ugly, if title and not sbplt: kwargs['title'] = title # no interactive subplots yet: if sbplt and interactive: import warnings interactive = False warnings.warn('No interactive subplots yet, sorry.') return # not using pandas for labels or legend anymore. 
#kwargs['labels'] = None #kwargs['legend'] = False if legend: if num_to_plot > 6: if not kwargs.get('ncol'): kwargs['ncol'] = num_to_plot / 7 # kwarg options go in leg_options leg_options = { 'framealpha': .8, 'shadow': kwargs.get('shadow', False), 'ncol': kwargs.pop('ncol', 1) } # determine legend position based on this dict if legend_pos: possible = { 'best': 0, 'upper right': 1, 'upper left': 2, 'lower left': 3, 'lower right': 4, 'right': 5, 'center left': 6, 'center right': 7, 'lower center': 8, 'upper center': 9, 'center': 10, 'o r': 2, 'outside right': 2, 'outside upper right': 2, 'outside center right': 'center left', 'outside lower right': 'lower left' } if type(legend_pos) == int: the_loc = legend_pos elif type(legend_pos) == str: try: the_loc = possible[legend_pos] except KeyError: raise KeyError( 'legend_pos value must be one of:\n%s\n or an int between 0-10.' % ', '.join(list(possible.keys()))) leg_options['loc'] = the_loc #weirdness needed for outside plot if legend_pos in ['o r', 'outside right', 'outside upper right']: leg_options['bbox_to_anchor'] = (1.02, 1) if legend_pos == 'outside center right': leg_options['bbox_to_anchor'] = (1.02, 0.5) if legend_pos == 'outside lower right': leg_options['loc'] == 'upper right' leg_options['bbox_to_anchor'] = (0.5, 0.5) # a bit of distance between legend and plot for outside legends if type(legend_pos) == str: if legend_pos.startswith('o'): leg_options['borderaxespad'] = 1 if not piemode: if show_totals.endswith('both') or show_totals.endswith('legend'): dataframe = rename_data_with_total(dataframe, was_series=was_series, using_tex=using_tex, absolutes=absolutes) else: if pie_legend: if show_totals.endswith('both') or show_totals.endswith('legend'): dataframe = rename_data_with_total(dataframe, was_series=was_series, using_tex=using_tex, absolutes=absolutes) if piemode: if partial_pie: dataframe = dataframe / 100.0 # some pie things if piemode: if not sbplt: kwargs['y'] = list(dataframe.columns)[0] if 
pie_legend: kwargs['legend'] = False if was_series: leg_options['labels'] = list(dataframe.index) else: leg_options['labels'] = list(dataframe.columns) else: if pie_legend: kwargs['legend'] = False if was_series: leg_options['labels'] = list(dataframe.index) else: leg_options['labels'] = list(dataframe.index) def filler(df): pby = df.T.copy() for i in list(pby.columns): tot = pby[i].sum() pby[i] = pby[i] * 100.0 / tot return pby.T areamode = False if kind == 'area': areamode = True if legend is False: kwargs['legend'] = False # line highlighting option for interactive! if interactive: if 2 in interactive_types: if kind == 'line': kwargs['marker'] = ',' if not piemode: kwargs['alpha'] = 0.1 # convert dates --- works only in my current case! if plotting_a_totals_column or not was_series: try: can_it_be_int = int(list(dataframe.index)[0]) can_be_int = True except: can_be_int = False if can_be_int: if 1500 < int(list(dataframe.index)[0]): if 2050 > int(list(dataframe.index)[0]): n = pandas.PeriodIndex([d for d in list(dataframe.index)], freq='A') dataframe = dataframe.set_index(n) if kwargs.get('filled'): if areamode or kind.startswith('bar'): dataframe = filler(dataframe) kwargs.pop('filled', None) MARKERSIZE = 4 COLORMAP = { 0: { 'marker': None, 'dash': (None, None) }, 1: { 'marker': None, 'dash': [5, 5] }, 2: { 'marker': "o", 'dash': (None, None) }, 3: { 'marker': None, 'dash': [1, 3] }, 4: { 'marker': "s", 'dash': [5, 2, 5, 2, 5, 10] }, 5: { 'marker': None, 'dash': [5, 3, 1, 2, 1, 10] }, 6: { 'marker': 'o', 'dash': (None, None) }, 7: { 'marker': None, 'dash': [5, 3, 1, 3] }, 8: { 'marker': "1", 'dash': [1, 3] }, 9: { 'marker': "*", 'dash': [5, 5] }, 10: { 'marker': "2", 'dash': [5, 2, 5, 2, 5, 10] }, 11: { 'marker': "s", 'dash': (None, None) } } HATCHES = { 0: { 'color': '#dfdfdf', 'hatch': "/" }, 1: { 'color': '#6f6f6f', 'hatch': "\\" }, 2: { 'color': 'b', 'hatch': "|" }, 3: { 'color': '#dfdfdf', 'hatch': "-" }, 4: { 'color': '#6f6f6f', 'hatch': "+" }, 5: { 
'color': 'b', 'hatch': "x" } } if black_and_white: if kind == 'line': kwargs['linewidth'] = 1 cmap = plt.get_cmap('Greys') new_cmap = truncate_colormap(cmap, 0.25, 0.95) if kind == 'bar': # darker if just one entry if len(dataframe.columns) == 1: new_cmap = truncate_colormap(cmap, 0.70, 0.90) kwargs['colormap'] = new_cmap class dummy_context_mgr(): """a fake context for plotting without style perhaps made obsolete by 'classic' style in new mpl""" def __enter__(self): return None def __exit__(self, one, two, three): return False with plt.style.context( (style)) if style != 'matplotlib' else dummy_context_mgr(): if not sbplt: # check if negative values, no stacked if so if areamode: kwargs['legend'] = False if dataframe.applymap(lambda x: x < 0.0).any().any(): kwargs['stacked'] = False rev_leg = False ax = dataframe.plot(figsize=figsize, **kwargs) if areamode: handles, labels = plt.gca().get_legend_handles_labels() del handles del labels else: plt.gcf().set_tight_layout(False) if not piemode: ax = dataframe.plot(figsize=figsize, **kwargs) else: ax = dataframe.plot(figsize=figsize, **kwargs) handles, labels = plt.gca().get_legend_handles_labels() plt.legend(handles, labels, loc=leg_options['loc'], bbox_to_anchor=(0, -0.1, 1, 1), bbox_transform=plt.gcf().transFigure) # this line allows layouts with missing plots # i.e. 
layout = (5, 2) with only nine plots plt.gcf().set_tight_layout(False) if 'rot' in kwargs: if kwargs['rot'] != 0 and kwargs['rot'] != 90: labels = [item.get_text() for item in ax.get_xticklabels()] ax.set_xticklabels(labels, rotation=kwargs['rot'], ha='right') if transparent: plt.gcf().patch.set_facecolor('white') plt.gcf().patch.set_alpha(0) if black_and_white: if kind == 'line': # white background # change everything to black and white with interesting dashes and markers c = 0 for line in ax.get_lines(): line.set_color('black') #line.set_width(1) line.set_dashes(COLORMAP[c]['dash']) line.set_marker(COLORMAP[c]['marker']) line.set_markersize(MARKERSIZE) c += 1 if c == len(list(COLORMAP.keys())): c = 0 # draw legend with proper placement etc if legend: if not piemode and not sbplt: if 3 not in interactive_types: handles, labels = plt.gca().get_legend_handles_labels() # area doubles the handles and labels. this removes half: if areamode: handles = handles[-len(handles) / 2:] labels = labels[-len(labels) / 2:] if rev_leg: handles = handles[::-1] labels = labels[::-1] lgd = plt.legend(handles, labels, **leg_options) if interactive: # 1 = highlight lines # 2 = line labels # 3 = legend switches ax = plt.gca() # fails for piemode lines = ax.lines handles, labels = plt.gca().get_legend_handles_labels() if 1 in interactive_types: plugins.connect(plt.gcf(), HighlightLines(lines)) if 3 in interactive_types: plugins.connect( plt.gcf(), InteractiveLegendPlugin(lines, labels, alpha_unsel=0.0)) for i, l in enumerate(lines): y_vals = l.get_ydata() x_vals = l.get_xdata() x_vals = [str(x) for x in x_vals] if absolutes: ls = [ '%s (%s: %d)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals) ] else: ls = [ '%s (%s: %.2f%%)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals) ] if 2 in interactive_types: #if 'kind' in kwargs and kwargs['kind'] == 'area': tooltip_line = mpld3.plugins.LineLabelTooltip( lines[i], labels[i]) 
mpld3.plugins.connect(plt.gcf(), tooltip_line) #else: if kind == 'line': tooltip_point = mpld3.plugins.PointLabelTooltip(l, labels=ls) mpld3.plugins.connect(plt.gcf(), tooltip_point) if piemode: if not sbplt: plt.axis('equal') ax.get_xaxis().set_visible(False) ax.get_yaxis().set_visible(False) # add x label # this could be revised now! # if time series period, it's year for now if type(dataframe.index) == pandas.tseries.period.PeriodIndex: x_label = 'Year' if x_label is not False: if type(x_label) == str: plt.xlabel(x_label) else: check_x_axis = list( dataframe.index )[0] # get first entry# get second entry of first entry (year, count) try: if type(dataframe.index) == pandas.tseries.period.PeriodIndex: x_label = 'Year' check_x_axis = int(check_x_axis) if 1500 < check_x_axis < 2050: x_label = 'Year' else: x_label = 'Group' except: x_label = 'Group' if not sbplt: if not piemode: plt.xlabel(x_label) def is_number(s): """check if str can be can be made into float/int""" try: float(s) # for int, long and float except ValueError: try: complex(s) # for complex except ValueError: return False return True # for now, always turn off sci notation from matplotlib.ticker import ScalarFormatter if type(dataframe.index) != pandas.tseries.period.PeriodIndex: try: if all(is_number(s) for s in list(dataframe.index)): plt.gca().xaxis.set_major_formatter(ScalarFormatter()) except: pass try: if all(is_number(s) for s in list(dataframe.columns)): plt.gca().yaxis.set_major_formatter(ScalarFormatter()) except: pass # y labelling y_l = False if not absolutes: y_l = 'Percentage' else: y_l = 'Absolute frequency' def suplabel(axis, label, label_prop=None, labelpad=5, ha='center', va='center'): ''' Add super ylabel or xlabel to the figure Similar to matplotlib.suptitle axis - string: "x" or "y" label - string label_prop - keyword dictionary for Text labelpad - padding from the axis (default: 5) ha - horizontal alignment (default: "center") va - vertical alignment (default: "center") ''' fig = 
plt.gcf() xmin = [] ymin = [] for ax in fig.axes: xmin.append(ax.get_position().xmin) ymin.append(ax.get_position().ymin) xmin, ymin = min(xmin), min(ymin) dpi = fig.dpi if axis.lower() == "y": rotation = 90. x = xmin - float(labelpad) / dpi y = 0.5 elif axis.lower() == 'x': rotation = 0. x = 0.5 y = ymin - float(labelpad) / dpi else: raise Exception("Unexpected axis: x or y") if label_prop is None: label_prop = dict() plt.gcf().text(x, y, label, rotation=rotation, transform=fig.transFigure, ha=ha, va=va, **label_prop) if y_label is not False: if not sbplt: if not piemode: if type(y_label) == str: plt.ylabel(y_label) else: plt.ylabel(y_l) else: if type(y_label) == str: the_y = y_label else: the_y = y_l #suplabel('y', the_y, labelpad = 1.5) plt.gcf().text(0.04, 0.5, the_y, va='center', rotation='vertical') #plt.subplots_adjust(left=0.5) # if not piemode: # if type(y_label) == str: # plt.ylabel(y_label) # else: # plt.ylabel(y_l) # hacky: turn legend into subplot titles :) if sbplt: # title the big plot #plt.gca().suptitle(title, fontsize = 16) #plt.subplots_adjust(top=0.9) # get all axes if 'layout' not in kwargs: axes = [l for index, l in enumerate(ax)] else: axes = [] cols = [l for index, l in enumerate(ax)] for col in cols: for bit in col: axes.append(bit) # set subplot titles for index, a in enumerate(axes): try: titletext = list(dataframe.columns)[index] except: pass a.set_title(titletext) try: a.legend_.remove() except: pass # remove axis labels for pie plots if piemode: a.axes.get_xaxis().set_visible(False) a.axes.get_yaxis().set_visible(False) a.axis('equal') # show grid a.grid(b=kwargs.get('grid', False)) kwargs.pop('grid', None) # add sums to bar graphs and pie graphs # doubled right now, no matter if not sbplt: if kind.startswith('bar'): width = ax.containers[0][0].get_width() # show grid ax.grid(b=kwargs.get('grid', False)) kwargs.pop('grid', None) if was_series: the_y_limit = plt.ylim()[1] if show_totals.endswith('plot') or show_totals.endswith('both'): 
# make plot a bit higher if putting these totals on it plt.ylim([0, the_y_limit * 1.05]) for i, label in enumerate(list(dataframe.index)): if len(dataframe.ix[label]) == 1: score = dataframe.ix[label][0] else: if absolutes: score = dataframe.ix[label].sum() else: #import warnings #warnings.warn("It's not possible to determine total percentage from individual percentages.") continue if not absolutes: plt.annotate('%.2f' % score, (i, score), ha='center', va='bottom') else: plt.annotate(score, (i, score), ha='center', va='bottom') else: the_y_limit = plt.ylim()[1] if show_totals.endswith('plot') or show_totals.endswith('both'): for i, label in enumerate(list(dataframe.columns)): if len(dataframe[label]) == 1: score = dataframe[label][0] else: if absolutes: score = dataframe[label].sum() else: #import warnings #warnings.warn("It's not possible to determine total percentage from individual percentages.") continue if not absolutes: plt.annotate('%.2f' % score, (i, score), ha='center', va='bottom') else: plt.annotate(score, (i, score), ha='center', va='bottom') plt.subplots_adjust(left=0.1) plt.subplots_adjust(bottom=0.18) if 'layout' not in kwargs: if not sbplt: plt.tight_layout() if save: import os if running_python_tex: imagefolder = '../images' else: imagefolder = 'images' savename = get_savename(imagefolder, save=save, title=title, ext=output_format) if not os.path.isdir(imagefolder): os.makedirs(imagefolder) # save image and get on with our lives if legend_pos.startswith('o'): plt.gcf().savefig(savename, dpi=150, bbox_extra_artists=(lgd, ), bbox_inches='tight', format=output_format) else: plt.gcf().savefig(savename, dpi=150, format=output_format) time = strftime("%H:%M:%S", localtime()) if os.path.isfile(savename): print('\n' + time + ": " + savename + " created.") else: raise ValueError("Error making %s." 
% savename) if dragmode: plt.legend().draggable() if sbplt: plt.subplots_adjust(right=.8) plt.subplots_adjust(left=.1) if not interactive and not running_python_tex and not running_spider \ and not tk: plt.gcf().show() return elif running_spider or tk: return plt if interactive: plt.subplots_adjust(right=.8) plt.subplots_adjust(left=.1) try: ax.legend_.remove() except: pass return mpld3.display()
# In[116]:
# Draw one line per person on a shared chart; the axis labels below mark the
# x axis as the year and the y axis as the rank.
# NOTE(review): `.ix` was removed in pandas 1.0. `.iloc` is the positional
# equivalent only if the columns of `ad` are not integer-labelled -- confirm
# against where `ad` is built before porting.
line_specs = [
    ("민수", {}),  # ad.ix[:, 0].name
    ("영희", {"linestyle": "--"}),
    ("철수", {"linestyle": ":"}),
]
for column_pos, (legend_label, extra_style) in enumerate(line_specs):
    plt.plot(ad.ix[:, column_pos], label=legend_label, **extra_style)

# Axis ranges and tick marks (limits are set before the ticks, as before).
plt.xlim([1999.9, 2002.1])
plt.xticks([2000, 2001, 2002], ["2000년", "2001년", "2002년"])
plt.ylim([0.9, 3.1])
plt.yticks([1, 2, 3])

# Title and axis captions.
plt.title("기록 순위 비교 그래프", fontsize=16)
plt.xlabel("연도", fontsize=14)
plt.ylabel("순위", fontsize=14)

plt.legend()
plt.show()

# If you want to compare by year instead

# In[117]:
ac

# In[118]:
# Rank across the columns of each row.
ac.rank(axis=1)

# #### applymap

# In[119]:
# Format every cell as an integer followed by the seconds suffix.
ac.applymap(lambda x: "%d초" % x)
# Demo script: apply functions along the axes of a small random DataFrame,
# format every element as text, then sort and rank it. Each step prints its
# result. `print(...)` with a single argument behaves the same on Python 2
# and Python 3, so the script runs on both.
import numpy as np
randn = np.random.randn
import pandas as pd
from pandas import Series, DataFrame

frame = DataFrame(np.random.randn(4, 3), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)


def f(x):
    """Return the spread (max - min) of the Series passed in by apply."""
    return x.max() - x.min()


print(frame.apply(f))           # one value per column
print(frame.apply(f, axis=1))   # one value per row

print('fancy f(x)---------')


# `f` is deliberately rebound: returning a Series from the applied function
# makes apply() build a DataFrame with one row per Series entry.
def f(x):
    """Return the min and max of the Series as a labelled two-item Series."""
    return Series([x.min(), x.max()], index=['min', 'max'])


print(frame.apply(f))


def two_decimals(x):
    """Format a number with two digits after the decimal point."""
    # Named so that it does not shadow the builtin format().
    return '%.2f' % x


# Element-wise application: DataFrame.applymap was renamed DataFrame.map in
# pandas 2.1, so use whichever method this pandas version provides.
elementwise = frame.map if hasattr(frame, 'map') else frame.applymap
formatted = elementwise(two_decimals)
print(formatted)

print('sorting-------------')
# Sorting by a column's values: sort_index(by=...) was replaced by
# sort_values(by=...) and later removed; fall back only on old pandas.
if hasattr(frame, 'sort_values'):
    sorted_by_b = frame.sort_values(by='b')
else:
    sorted_by_b = frame.sort_index(by='b')
print(sorted_by_b)

# Rank within each row; ties receive the highest rank of their group.
print(frame.rank(method='max', axis=1))
def test_default_handler(self):
    """Check a JSON round trip that relies on ``default_handler=str``.

    A frame holding a bare ``object()`` instance is serialised with
    ``to_json(default_handler=str)`` and read back with ``read_json``.
    The result must equal the original frame with ``str`` applied to
    every cell; the index type is not compared.
    """
    opaque = object()
    original = DataFrame({"a": ["a", opaque]})

    # Serialise first, then parse the resulting JSON text back into a frame.
    json_text = original.to_json(default_handler=str)
    roundtripped = pd.read_json(json_text)

    # Reference value: every cell of the original converted with str().
    stringified = original.applymap(str)
    assert_frame_equal(stringified, roundtripped, check_index_type=False)
def plotter(title, df, x_label = None, y_label = None, style = 'ggplot', figsize = (8, 4), save = False, legend_pos = 'best', reverse_legend = 'guess', num_to_plot = 7, tex = 'try', colours = 'Paired', cumulative = False, pie_legend = True, partial_pie = False, show_totals = False, transparent = False, output_format = 'png', interactive = False, black_and_white = False, show_p_val = False, indices = 'guess', **kwargs): """plot interrogator() or editor() output. **kwargs are for pandas first, which can then send them through to matplotlib.plot(): http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.plot.html http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.plot pie_legend: False to label slices rather than give legend show_totals: where to show percent/abs frequencies: False, 'plot', 'legend', or 'both' """ import corpkit import os import matplotlib as mpl if interactive: import matplotlib.pyplot as plt, mpld3 else: import matplotlib.pyplot as plt from matplotlib import rc import pandas import pandas as pd from pandas import DataFrame import numpy from time import localtime, strftime from corpkit.tests import check_pytex, check_spider, check_t_kinter if interactive: import mpld3 import collections from mpld3 import plugins, utils from plugins import InteractiveLegendPlugin, HighlightLines tk = check_t_kinter() running_python_tex = check_pytex() # incorrect spelling of spider on purpose running_spider = check_spider() def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100): """remove extreme values from colourmap --- no pure white""" import matplotlib.colors as colors import numpy as np new_cmap = colors.LinearSegmentedColormap.from_list( 'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval), cmap(np.linspace(minval, maxval, n))) return new_cmap def get_savename(imagefolder, save = False, title = False, ext = 'png'): """Come up with the savename for the image.""" import os def urlify(s): "Turn title into filename" import re 
s = s.lower() s = re.sub(r"[^\w\s-]", '', s) s = re.sub(r"\s+", '-', s) s = re.sub(r"-(textbf|emph|textsc|textit)", '-', s) return s # name as if not ext.startswith('.'): ext = '.' + ext if type(save) == str: savename = os.path.join(imagefolder, (urlify(save) + ext)) #this 'else' is redundant now that title is obligatory else: if title: filename = urlify(title) + ext savename = os.path.join(imagefolder, filename) # remove duplicated ext if savename.endswith('%s%s' % (ext, ext)): savename = savename.replace('%s%s' % (ext, ext), ext, 1) return savename def rename_data_with_total(dataframe, was_series = False, using_tex = False, absolutes = True): """adds totals (abs, rel, keyness) to entry name strings""" if was_series: where_the_words_are = dataframe.index else: where_the_words_are = dataframe.columns the_labs = [] for w in list(where_the_words_are): if not absolutes: if was_series: perc = dataframe.T[w][0] else: the_labs.append(w) continue if using_tex: the_labs.append('%s (%.2f\%%)' % (w, perc)) else: the_labs.append('%s (%.2f %%)' % (w, perc)) else: if was_series: score = dataframe.T[w].sum() else: score = dataframe[w].sum() if using_tex: the_labs.append('%s (n=%d)' % (w, score)) else: the_labs.append('%s (n=%d)' % (w, score)) if not was_series: dataframe.columns = the_labs else: vals = list(dataframe[list(dataframe.columns)[0]].values) dataframe = pd.DataFrame(vals, index = the_labs) dataframe.columns = ['Total'] return dataframe def auto_explode(dataframe, input, was_series = False, num_to_plot = 7): """give me a list of strings and i'll output explode option""" output = [0 for s in range(num_to_plot)] if was_series: l = list(dataframe.index) else: l = list(dataframe.columns) if type(input) == str or type(input) == int: input = [input] if type(input) == list: for i in input: if type(i) == str: index = l.index(i) else: index = i output[index] = 0.1 return output # are we doing subplots? 
sbplt = False if 'subplots' in kwargs: if kwargs['subplots'] is True: sbplt = True if colours is True: colours = 'Paired' styles = ['dark_background', 'bmh', 'grayscale', 'ggplot', 'fivethirtyeight'] if style not in styles: raise ValueError('Style %s not found. Use %s' % (style, ', '.join(styles))) if 'savepath' in kwargs.keys(): mpl.rcParams['savefig.directory'] = kwargs['savepath'] del kwargs['savepath'] mpl.rcParams['savefig.bbox'] = 'tight' # try to use tex # TO DO: # make some font kwargs here using_tex = False mpl.rcParams['font.family'] = 'sans-serif' mpl.rcParams['text.latex.unicode'] = True if tex == 'try' or tex is True: try: rc('text', usetex=True) rc('font', **{'family': 'serif', 'serif': ['Computer Modern']}) using_tex = True except: matplotlib.rc('font', family='sans-serif') matplotlib.rc('font', serif='Helvetica Neue') matplotlib.rc('text', usetex='false') rc('text', usetex=False) else: rc('text', usetex=False) if interactive: using_tex = False if show_totals is False: show_totals = 'none' # find out what kind of plot we're making, and enable # or disable interactive values if need be if 'kind' not in kwargs: kwargs['kind'] = 'line' if interactive: if kwargs['kind'].startswith('bar'): interactive_types = [3] elif kwargs['kind'] == 'area': interactive_types = [2, 3] elif kwargs['kind'] == 'line': interactive_types = [2, 3] elif kwargs['kind'] == 'pie': interactive_types = None warnings.warn('Interactive plotting not yet available for pie plots.') else: interactive_types = [None] if interactive is False: interactive_types = [None] # find out if pie mode, add autopct format piemode = False if 'kind' in kwargs: if kwargs['kind'] == 'pie': piemode = True # always the best spot for pie #if legend_pos == 'best': #legend_pos = 'lower left' if show_totals.endswith('plot') or show_totals.endswith('both'): kwargs['pctdistance'] = 0.6 if using_tex: kwargs['autopct'] = r'%1.1f\%%' else: kwargs['autopct'] = '%1.1f%%' #if piemode: #if partial_pie: 
#kwargs['startangle'] = 180 kwargs['subplots'] = sbplt # copy data, make series into df dataframe = df.copy() was_series = False if type(dataframe) == pandas.core.series.Series: was_series = True if not cumulative: dataframe = DataFrame(dataframe) else: dataframe = DataFrame(dataframe.cumsum()) else: # don't know if this is much good. if cumulative: dataframe = DataFrame(dataframe.cumsum()) if len(list(dataframe.columns)) == 1: was_series = True # attempt to convert x axis to ints: try: dataframe.index = [int(i) for i in list(dataframe.index)] except: pass # remove totals and tkinter order if not was_series: for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]): dataframe = dataframe.drop(name, axis = ax, errors = 'ignore') else: dataframe = dataframe.drop('tkintertable-order', errors = 'ignore') dataframe = dataframe.drop('tkintertable-order', axis = 1, errors = 'ignore') # look at columns to see if all can be ints, in which case, set up figure # for depnumming if not was_series: if indices == 'guess': def isint(x): try: a = float(x) b = int(a) except ValueError or OverflowError: return False else: return a == b if all([isint(x) is True for x in list(dataframe.columns)]): indices = True else: indices = False # if depnumming, plot all, transpose, and rename axes if indices is True: num_to_plot = 'all' dataframe = dataframe.T if y_label is None: y_label = 'Percentage of all matches' if x_label is None: x_label = '' # set backend? output_formats = ['svgz', 'ps', 'emf', 'rgba', 'raw', 'pdf', 'svg', 'eps', 'png', 'pgf'] if output_format not in output_formats: raise ValueError('%s output format not recognised. 
Must be: %s' % (output_format, ', '.join(output_formats))) # don't know if these are necessary if 'pdf' in output_format: plt.switch_backend(output_format) if 'pgf' in output_format: plt.switch_backend(output_format) if num_to_plot == 'all': if was_series: if not piemode: num_to_plot = len(dataframe) else: num_to_plot = len(dataframe) else: if not piemode: num_to_plot = len(list(dataframe.columns)) else: num_to_plot = len(dataframe.index) # explode pie, or remove if not piemode if 'explode' in kwargs: if not piemode: del kwargs['explode'] if piemode: if 'explode' in kwargs: if not sbplt: kwargs['explode'] = auto_explode(dataframe, kwargs['explode'], was_series = was_series, num_to_plot = num_to_plot) if 'legend' in kwargs: legend = kwargs['legend'] else: legend = True #cut data short plotting_a_totals_column = False if was_series: if list(dataframe.columns)[0] != 'Total': try: can_be_ints = [int(x) for x in list(dataframe.index)] num_to_plot = len(dataframe) except: dataframe = dataframe[:num_to_plot] elif list(dataframe.columns)[0] == 'Total': plotting_a_totals_column = True if not 'legend' in kwargs: legend = False num_to_plot = len(dataframe) else: dataframe = dataframe.T.head(num_to_plot).T # remove stats fields, put p in entry text, etc. 
statfields = ['slope', 'intercept', 'r', 'p', 'stderr'] try: dataframe = dataframe.drop(statfields, axis = 1) except: pass try: dataframe.ix['p'] there_are_p_vals = True except: there_are_p_vals = False if show_p_val: if there_are_p_vals: newnames = [] for col in list(dataframe.columns): pval = dataframe[col]['p'] newname = '%s (p=%s)' % (col, format(pval, '.5f')) newnames.append(newname) dataframe.columns = newnames dataframe.drop(statfields, axis = 0, inplace = True) else: warnings.warn('No p-values calculated to show.\n\nUse sort_by and keep_stats in editor() to generate these values.') else: if there_are_p_vals: dataframe.drop(statfields, axis = 0, inplace = True) # make and set y label absolutes = True if type(dataframe) == pandas.core.frame.DataFrame: try: if not all([s.is_integer() for s in dataframe.iloc[0,:].values]): absolutes = False except: pass else: if not all([s.is_integer() for s in dataframe.values]): absolutes = False # use colormap if need be: if num_to_plot > 0: if not was_series: if 'kind' in kwargs: if kwargs['kind'] in ['pie', 'line', 'area']: if colours: if not plotting_a_totals_column: if colours == 'Default': colours = 'Paired' kwargs['colormap'] = colours #else: if colours: if colours == 'Default': colours = 'Paired' kwargs['colormap'] = colours if piemode: if num_to_plot > 0: if colours == 'Default': colours = 'Paired' kwargs['colormap'] = colours else: if num_to_plot > 0: if colours == 'Default': colours = 'Paired' kwargs['colormap'] = colours #else: #if len(dataframe.T.columns) < 8: #try: #del kwargs['colormap'] #except: #pass # multicoloured bar charts if 'kind' in kwargs: if colours: if kwargs['kind'].startswith('bar'): if len(list(dataframe.columns)) == 1: if not black_and_white: import numpy as np the_range = np.linspace(0, 1, num_to_plot) cmap = plt.get_cmap(colours) kwargs['colors'] = [cmap(n) for n in the_range] # make a bar width ... ? 
#kwargs['width'] = (figsize[0] / float(num_to_plot)) / 1.5 # reversing legend option if reverse_legend is True: rev_leg = True elif reverse_legend is False: rev_leg = False # show legend or don't, guess whether to reverse based on kind if 'kind' in kwargs: if kwargs['kind'] in ['bar', 'barh', 'area', 'line', 'pie']: if was_series: legend = False if kwargs['kind'] == 'pie': if pie_legend: legend = True else: legend = False if kwargs['kind'] in ['barh', 'area']: if reverse_legend == 'guess': rev_leg = True if not 'rev_leg' in locals(): rev_leg = False # the default legend placement if legend_pos is True: legend_pos = 'best' # cut dataframe if just_totals try: tst = dataframe['Combined total'] dataframe = dataframe.head(num_to_plot) except: pass # rotate automatically if 'rot' not in kwargs: if not was_series: xvals = [str(i) for i in list(dataframe.index)[:num_to_plot]] #if 'kind' in kwargs: #if kwargs['kind'] in ['barh', 'area']: #xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]] else: xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]] if len(max(xvals, key=len)) > 6: if not piemode: kwargs['rot'] = 45 # no title for subplots because ugly, if sbplt: if 'title' in kwargs: del kwargs['title'] else: kwargs['title'] = title # no interactive subplots yet: if sbplt and interactive: import warnings interactive = False warnings.warn('No interactive subplots yet, sorry.') return # not using pandas for labels or legend anymore. 
#kwargs['labels'] = None #kwargs['legend'] = False if legend: # kwarg options go in leg_options leg_options = {'framealpha': .8} if 'shadow' in kwargs: leg_options['shadow'] = True if 'ncol' in kwargs: leg_options['ncol'] = kwargs['ncol'] del kwargs['ncol'] else: if num_to_plot > 6: leg_options['ncol'] = num_to_plot / 7 # determine legend position based on this dict if legend_pos: possible = {'best': 0, 'upper right': 1, 'upper left': 2, 'lower left': 3, 'lower right': 4, 'right': 5, 'center left': 6, 'center right': 7, 'lower center': 8, 'upper center': 9, 'center': 10, 'o r': 2, 'outside right': 2, 'outside upper right': 2, 'outside center right': 'center left', 'outside lower right': 'lower left'} if type(legend_pos) == int: the_loc = legend_pos elif type(legend_pos) == str: try: the_loc = possible[legend_pos] except KeyError: raise KeyError('legend_pos value must be one of:\n%s\n or an int between 0-10.' %', '.join(possible.keys())) leg_options['loc'] = the_loc #weirdness needed for outside plot if legend_pos in ['o r', 'outside right', 'outside upper right']: leg_options['bbox_to_anchor'] = (1.02, 1) if legend_pos == 'outside center right': leg_options['bbox_to_anchor'] = (1.02, 0.5) if legend_pos == 'outside lower right': leg_options['loc'] == 'upper right' leg_options['bbox_to_anchor'] = (0.5, 0.5) # a bit of distance between legend and plot for outside legends if type(legend_pos) == str: if legend_pos.startswith('o'): leg_options['borderaxespad'] = 1 if not piemode: if show_totals.endswith('both') or show_totals.endswith('legend'): dataframe = rename_data_with_total(dataframe, was_series = was_series, using_tex = using_tex, absolutes = absolutes) else: if pie_legend: if show_totals.endswith('both') or show_totals.endswith('legend'): dataframe = rename_data_with_total(dataframe, was_series = was_series, using_tex = using_tex, absolutes = absolutes) if piemode: if partial_pie: dataframe = dataframe / 100.0 # some pie things if piemode: if not sbplt: 
kwargs['y'] = list(dataframe.columns)[0] if pie_legend: kwargs['legend'] = False if was_series: leg_options['labels'] = list(dataframe.index) else: leg_options['labels'] = list(dataframe.columns) else: if pie_legend: kwargs['legend'] = False if was_series: leg_options['labels'] = list(dataframe.index) else: leg_options['labels'] = list(dataframe.index) areamode = False if 'kind' in kwargs: if kwargs['kind'] == 'area': areamode = True if legend is False: kwargs['legend'] = False # cumulative grab first col if cumulative: kwargs['y'] = list(dataframe.columns)[0] # line highlighting option for interactive! if interactive: if 2 in interactive_types: if kwargs['kind'] == 'line': kwargs['marker'] = ',' if not piemode: kwargs['alpha'] = 0.1 # convert dates --- works only in my current case! if plotting_a_totals_column or not was_series: try: can_it_be_int = int(list(dataframe.index)[0]) can_be_int = True except: can_be_int = False if can_be_int: if 1500 < int(list(dataframe.index)[0]): if 2050 > int(list(dataframe.index)[0]): n = pd.PeriodIndex([d for d in list(dataframe.index)], freq='A') dataframe = dataframe.set_index(n) MARKERSIZE = 4 COLORMAP = { 0: {'marker': None, 'dash': (None,None)}, 1: {'marker': None, 'dash': [5,5]}, 2: {'marker': "o", 'dash': (None,None)}, 3: {'marker': None, 'dash': [1,3]}, 4: {'marker': "s", 'dash': [5,2,5,2,5,10]}, 5: {'marker': None, 'dash': [5,3,1,2,1,10]}, 6: {'marker': 'o', 'dash': (None,None)}, 7: {'marker': None, 'dash': [5,3,1,3]}, 8: {'marker': "1", 'dash': [1,3]}, 9: {'marker': "*", 'dash': [5,5]}, 10: {'marker': "2", 'dash': [5,2,5,2,5,10]}, 11: {'marker': "s", 'dash': (None,None)} } HATCHES = { 0: {'color': '#dfdfdf', 'hatch':"/"}, 1: {'color': '#6f6f6f', 'hatch':"\\"}, 2: {'color': 'b', 'hatch':"|"}, 3: {'color': '#dfdfdf', 'hatch':"-"}, 4: {'color': '#6f6f6f', 'hatch':"+"}, 5: {'color': 'b', 'hatch':"x"} } if black_and_white: if kwargs['kind'] == 'line': kwargs['linewidth'] = 1 cmap = plt.get_cmap('Greys') new_cmap = 
truncate_colormap(cmap, 0.25, 0.95) if kwargs['kind'] == 'bar': # darker if just one entry if len(dataframe.columns) == 1: new_cmap = truncate_colormap(cmap, 0.70, 0.90) kwargs['colormap'] = new_cmap # use styles and plot with plt.style.context((style)): if not sbplt: # check if negative values, no stacked if so if areamode: if dataframe.applymap(lambda x: x < 0.0).any().any(): kwargs['stacked'] = False rev_leg = False ax = dataframe.plot(figsize = figsize, **kwargs) else: if not piemode and not sbplt: ax = dataframe.plot(figsize = figsize, **kwargs) else: ax = dataframe.plot(figsize = figsize, **kwargs) handles, labels = plt.gca().get_legend_handles_labels() plt.legend( handles, labels, loc = leg_options['loc'], bbox_to_anchor = (0,-0.1,1,1), bbox_transform = plt.gcf().transFigure ) if not tk: plt.show() return if 'rot' in kwargs: if kwargs['rot'] != 0 and kwargs['rot'] != 90: labels = [item.get_text() for item in ax.get_xticklabels()] ax.set_xticklabels(labels, rotation = kwargs['rot'], ha='right') if transparent: plt.gcf().patch.set_facecolor('white') plt.gcf().patch.set_alpha(0) if black_and_white: #plt.grid() plt.gca().set_axis_bgcolor('w') if kwargs['kind'] == 'line': # white background # change everything to black and white with interesting dashes and markers c = 0 for line in ax.get_lines(): line.set_color('black') #line.set_width(1) line.set_dashes(COLORMAP[c]['dash']) line.set_marker(COLORMAP[c]['marker']) line.set_markersize(MARKERSIZE) c += 1 if c == len(COLORMAP.keys()): c = 0 if legend: if not piemode and not sbplt: if 3 not in interactive_types: if not rev_leg: lgd = plt.legend(**leg_options) else: handles, labels = plt.gca().get_legend_handles_labels() lgd = plt.legend(handles[::-1], labels[::-1], **leg_options) #if black_and_white: #lgd.set_facecolor('w') #if interactive: #if legend: #lgd.set_title("") #if not sbplt: #if 'layout' not in kwargs: #plt.tight_layout() if interactive: # 1 = highlight lines # 2 = line labels # 3 = legend switches ax = 
plt.gca() # fails for piemode lines = ax.lines handles, labels = plt.gca().get_legend_handles_labels() if 1 in interactive_types: plugins.connect(plt.gcf(), HighlightLines(lines)) if 3 in interactive_types: plugins.connect(plt.gcf(), InteractiveLegendPlugin(lines, labels, alpha_unsel=0.0)) for i, l in enumerate(lines): y_vals = l.get_ydata() x_vals = l.get_xdata() x_vals = [str(x) for x in x_vals] if absolutes: ls = ['%s (%s: %d)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)] else: ls = ['%s (%s: %.2f%%)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)] if 2 in interactive_types: #if 'kind' in kwargs and kwargs['kind'] == 'area': tooltip_line = mpld3.plugins.LineLabelTooltip(lines[i], labels[i]) mpld3.plugins.connect(plt.gcf(), tooltip_line) #else: if kwargs['kind'] == 'line': tooltip_point = mpld3.plugins.PointLabelTooltip(l, labels = ls) mpld3.plugins.connect(plt.gcf(), tooltip_point) # works: #plugins.connect(plt.gcf(), plugins.LineLabelTooltip(l, labels[i])) #labels = ["Point {0}".format(i) for i in range(num_to_plot)] #tooltip = plugins.LineLabelTooltip(lines) #mpld3.plugins.connect(plt.gcf(), mpld3.plugins.PointLabelTooltip(lines)) if piemode: if not sbplt: plt.axis('equal') ax.get_xaxis().set_visible(False) ax.get_yaxis().set_visible(False) # add x label # this could be revised now! 
# if time series period, it's year for now if type(dataframe.index) == pandas.tseries.period.PeriodIndex: x_label = 'Year' if x_label is not False: if type(x_label) == str: plt.xlabel(x_label) else: check_x_axis = list(dataframe.index)[0] # get first entry# get second entry of first entry (year, count) try: if type(dataframe.index) == pandas.tseries.period.PeriodIndex: x_label = 'Year' check_x_axis = int(check_x_axis) if 1500 < check_x_axis < 2050: x_label = 'Year' else: x_label = 'Group' except: x_label = 'Group' if not sbplt: if not piemode: plt.xlabel(x_label) # no offsets for numerical x and y values if type(dataframe.index) != pandas.tseries.period.PeriodIndex: try: # check if x axis can be an int check_x_axis = list(dataframe.index)[0] can_it_be_int = int(check_x_axis) # if so, set these things from matplotlib.ticker import ScalarFormatter plt.gca().xaxis.set_major_formatter(ScalarFormatter()) except: pass # same for y axis try: # check if x axis can be an int check_y_axis = list(dataframe.columns)[0] can_it_be_int = int(check_y_axis) # if so, set these things from matplotlib.ticker import ScalarFormatter plt.gca().yaxis.set_major_formatter(ScalarFormatter()) except: pass # y labelling y_l = False if not absolutes: y_l = 'Percentage' else: y_l = 'Absolute frequency' if y_label is not False: if not sbplt: if not piemode: if type(y_label) == str: plt.ylabel(y_label) else: plt.ylabel(y_l) # hacky: turn legend into subplot titles :) if sbplt: # title the big plot #plt.suptitle(title, fontsize = 16) # get all axes if 'layout' not in kwargs: axes = [l for index, l in enumerate(ax)] else: axes = [] cols = [l for index, l in enumerate(ax)] for col in cols: for bit in col: axes.append(bit) # set subplot titles for index, a in enumerate(axes): try: titletext = list(dataframe.columns)[index] except: pass a.set_title(titletext) try: a.legend_.remove() except: pass # remove axis labels for pie plots if piemode: a.axes.get_xaxis().set_visible(False) 
a.axes.get_yaxis().set_visible(False) a.axis('equal') # add sums to bar graphs and pie graphs # doubled right now, no matter if not sbplt: if 'kind' in kwargs: if kwargs['kind'].startswith('bar'): width = ax.containers[0][0].get_width() if was_series: the_y_limit = plt.ylim()[1] if show_totals.endswith('plot') or show_totals.endswith('both'): # make plot a bit higher if putting these totals on it plt.ylim([0,the_y_limit * 1.05]) for i, label in enumerate(list(dataframe.index)): if len(dataframe.ix[label]) == 1: score = dataframe.ix[label][0] else: if absolutes: score = dataframe.ix[label].sum() else: #import warnings #warnings.warn("It's not possible to determine total percentage from individual percentages.") continue if not absolutes: plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom') else: plt.annotate(score, (i, score), ha = 'center', va = 'bottom') else: the_y_limit = plt.ylim()[1] if show_totals.endswith('plot') or show_totals.endswith('both'): for i, label in enumerate(list(dataframe.columns)): if len(dataframe[label]) == 1: score = dataframe[label][0] else: if absolutes: score = dataframe[label].sum() else: #import warnings #warnings.warn("It's not possible to determine total percentage from individual percentages.") continue if not absolutes: plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom') else: plt.annotate(score, (i, score), ha = 'center', va = 'bottom') #if not running_python_tex: #plt.gcf().show() plt.subplots_adjust(left=0.1) plt.subplots_adjust(bottom=0.18) #if 'layout' not in kwargs: #plt.tight_layout() if save: import os if running_python_tex: imagefolder = '../images' else: imagefolder = 'images' savename = get_savename(imagefolder, save = save, title = title, ext = output_format) if not os.path.isdir(imagefolder): os.makedirs(imagefolder) # save image and get on with our lives if legend_pos.startswith('o'): plt.gcf().savefig(savename, dpi=150, bbox_extra_artists=(lgd,), bbox_inches='tight', format = 
output_format) else: plt.gcf().savefig(savename, dpi=150, format = output_format) time = strftime("%H:%M:%S", localtime()) if os.path.isfile(savename): print '\n' + time + ": " + savename + " created." else: raise ValueError("Error making %s." % savename) if not interactive and not running_python_tex and not running_spider and not tk: plt.show() return if running_spider or tk or sbplt: return plt if interactive: plt.subplots_adjust(right=.8) plt.subplots_adjust(left=.1) try: ax.legend_.remove() except: pass return mpld3.display()
# DataFrame.apply hands each column (or each row) to the given function as a
# one-dimensional Series, i.e. the function works at Series level.
def f(x):
    """Return the spread (maximum minus minimum) of the Series *x*."""
    return x.max() - x.min()


frame.apply(f)          # axis=0 is the default: spread of every column
frame.apply(f, axis=1)  # axis=1: spread of every row

# Many common array statistics (sum, mean, ...) already exist as DataFrame
# methods, so apply is not needed for those.

# The function passed to apply does not have to return a scalar; it may also
# return a Series holding several values.
def f(x):
    """Return the minimum and maximum of *x* as a Series labelled 'min'/'max'."""
    lowest = x.min()
    highest = x.max()
    return Series([lowest, highest], index=['min', 'max'])


frame.apply(f)  # one 'min' row and one 'max' row, computed per column

# Element-wise Python functions can be used on a DataFrame too, through
# applymap.
# NOTE: this binding shadows the builtin ``format`` at module level.
def format(x):
    """Render a single element as a string with two decimal places."""
    return '%.2f' % x


frame.applymap(format)

# Series offers the element-wise counterpart under the name map.
frame['e'].map(format)

# --- Sorting data ---
# Sorting by index labels.
obj = Series(data=range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()  # sort by index labels

# A DataFrame can be sorted by the labels of either axis.
frame = DataFrame(
    np.arange(8).reshape(2, 4),
    index=['three', 'one'],
    columns=['d', 'a', 'b', 'c'],
)
frame.sort_index()        # default: sort by the row index
frame.sort_index(axis=1)  # sort by the column labels
def plotter(df, title=False, kind='line', x_label=None, y_label=None, style='ggplot', figsize=(8, 4), save=False, legend_pos='best', reverse_legend='guess', num_to_plot=6, tex='try', colours='default', cumulative=False, pie_legend=True, partial_pie=False, show_totals=False, transparent=False, output_format='png', interactive=False, black_and_white=False, show_p_val=False, indices=False, transpose=False, rot=False, **kwargs): """Visualise corpus interrogations. :param title: A title for the plot :type title: str :param df: Data to be plotted :type df: Pandas DataFrame :param x_label: A label for the x axis :type x_label: str :param y_label: A label for the y axis :type y_label: str :param kind: The kind of chart to make :type kind: str ('line'/'bar'/'barh'/'pie'/'area') :param style: Visual theme of plot :type style: str ('ggplot'/'bmh'/'fivethirtyeight'/'seaborn-talk'/etc) :param figsize: Size of plot :type figsize: tuple (int, int) :param save: If bool, save with *title* as name; if str, use str as name :type save: bool/str :param legend_pos: Where to place legend :type legend_pos: str ('upper right'/'outside right'/etc) :param reverse_legend: Reverse the order of the legend :type reverse_legend: bool :param num_to_plot: How many columns to plot :type num_to_plot: int/'all' :param tex: Use TeX to draw plot text :type tex: bool :param colours: Colourmap for lines/bars/slices :type colours: str :param cumulative: Plot values cumulatively :type cumulative: bool :param pie_legend: Show a legend for pie chart :type pie_legend: bool :param partial_pie: Allow plotting of pie slices only :type partial_pie: bool :param show_totals: Print sums in plot where possible :type show_totals: str -- 'legend'/'plot'/'both' :param transparent: Transparent .png background :type transparent: bool :param output_format: File format for saved image :type output_format: str -- 'png'/'pdf' :param black_and_white: Create black and white line styles :type black_and_white: bool :param 
show_p_val: Attempt to print p values in legend if contained in df :type show_p_val: bool :param indices: To use when plotting "distance from root" :type indices: bool :param stacked: When making bar chart, stack bars on top of one another :type stacked: str :param filled: For area and bar charts, make every column sum to 100 :type filled: str :param legend: Show a legend :type legend: bool :param rot: Rotate x axis ticks by *rot* degrees :type rot: int :param subplots: Plot each column separately :type subplots: bool :param layout: Grid shape to use when *subplots* is True :type layout: tuple -- (int, int) :param interactive: Experimental interactive options :type interactive: list -- [1, 2, 3] :returns: matplotlib figure """ import corpkit import os try: from IPython.utils.shimmodule import ShimWarning import warnings warnings.simplefilter('ignore', ShimWarning) except: pass kwargs['rot'] = rot xtickspan = kwargs.pop('xtickspan', False) # prefer seaborn plotting try: import seaborn as sns except (ImportError, AttributeError): pass import matplotlib as mpl from matplotlib import rc if interactive: import matplotlib.pyplot as plt, mpld3 else: import matplotlib.pyplot as plt import matplotlib.ticker as ticker import pandas from pandas import DataFrame, Series, MultiIndex from time import localtime, strftime from process import checkstack if interactive: import mpld3 import collections from mpld3 import plugins, utils from plugins import InteractiveLegendPlugin, HighlightLines have_mpldc = False try: from mpldatacursor import datacursor, HighlightingDataCursor have_mpldc = True except ImportError: pass # if the data was multiindexed, the default is a little different! 
from corpkit.interrogation import Interrogation if isinstance(df.index, MultiIndex): import matplotlib.pyplot as nplt shape = kwargs.get('shape', 'auto') truncate = kwargs.get('truncate', 8) if shape == 'auto': shape = (int(len(df.index.levels[0]) / 2), 2) f, axes = nplt.subplots(*shape) for i, ((name, data), ax) in enumerate(zip(df.groupby(level=0), axes.flatten())): data = data.loc[name] if isinstance(truncate, int) and i > truncate: continue if kwargs.get('name_format'): name = kwargs.get('name_format').format(name) data = Interrogation(results=data, totals=data.sum(axis=1), query=None) data.visualise(title=name, ax=ax, kind=kind, x_label=x_label, y_label=y_label, style=style, figsize=figsize, save=save, legend_pos=legend_pos, reverse_legend=reverse_legend, num_to_plot=num_to_plot, tex=tex, colours=colours, cumulative=cumulative, pie_legend=pie_legend, partial_pie=partial_pie, show_totals=show_totals, transparent=transparent, output_format=output_format, interactive=interactive, black_and_white=black_and_white, show_p_val=show_p_val, indices=indices, transpose=transpose, rot=rot) return nplt def copy(self): from corpkit.interrogation import Interrodict copied = {} for k, v in self.items(): copied[k] = v return Interrodict(copied) # check what environment we're in tk = checkstack('tkinter') running_python_tex = checkstack('pythontex') running_spider = checkstack('spyder') if not title: title = '' def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100): """remove extreme values from colourmap --- no pure white""" import matplotlib.colors as colors import numpy as np new_cmap = colors.LinearSegmentedColormap.from_list( 'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval), cmap(np.linspace(minval, maxval, n))) return new_cmap def get_savename(imagefolder, save=False, title=False, ext='png'): """Come up with the savename for the image.""" import os from corpkit.process import urlify # name as if not ext.startswith('.'): ext = '.' 
+ ext if isinstance(save, STRINGTYPE): savename = os.path.join(imagefolder, (urlify(save) + ext)) #this 'else' is redundant now that title is obligatory else: if title: filename = urlify(title) + ext savename = os.path.join(imagefolder, filename) # remove duplicated ext if savename.endswith('%s%s' % (ext, ext)): savename = savename.replace('%s%s' % (ext, ext), ext, 1) return savename def rename_data_with_total(dataframe, was_series=False, using_tex=False, absolutes=True): """adds totals (abs, rel, keyness) to entry name strings""" if was_series: where_the_words_are = dataframe.index else: where_the_words_are = dataframe.columns the_labs = [] for w in list(where_the_words_are): if not absolutes: if was_series: perc = dataframe.T[w][0] else: the_labs.append(w) continue if using_tex: the_labs.append('%s (%.2f\%%)' % (w, perc)) else: the_labs.append('%s (%.2f %%)' % (w, perc)) else: if was_series: score = dataframe.T[w].sum() else: score = dataframe[w].sum() if using_tex: the_labs.append('%s (n=%d)' % (w, score)) else: the_labs.append('%s (n=%d)' % (w, score)) if not was_series: dataframe.columns = the_labs else: vals = list(dataframe[list(dataframe.columns)[0]].values) dataframe = pandas.DataFrame(vals, index=the_labs) dataframe.columns = ['Total'] return dataframe def auto_explode(dataframe, tinput, was_series=False, num_to_plot=7): """give me a list of strings and i'll output explode option""" output = [0 for s in range(num_to_plot)] if was_series: l = list(dataframe.index) else: l = list(dataframe.columns) if isinstance(tinput, (STRINGTYPE, int)): tinput = [tinput] if isinstance(tinput, list): for i in tinput: if isinstance(i, STRINGTYPE): index = l.index(i) else: index = i output[index] = 0.1 return output # get a few options from kwargs sbplt = kwargs.get('subplots', False) show_grid = kwargs.pop('grid', True) the_rotation = kwargs.get('rot', False) dragmode = kwargs.pop('draggable', False) leg_frame = kwargs.pop('legend_frame', True) leg_alpha = 
kwargs.pop('legend_alpha', 0.8) # auto set num to plot based on layout lo = kwargs.get('layout', None) if lo: num_to_plot = lo[0] * lo[1] # todo: get this dynamically instead. styles = [ 'dark_background', 'bmh', 'grayscale', 'ggplot', 'fivethirtyeight', 'matplotlib', False, 'mpl-white' ] #if style not in styles: #raise ValueError('Style %s not found. Use %s' % (str(style), ', '.join(styles))) if style == 'mpl-white': try: sns.set_style("whitegrid") except: pass style = 'matplotlib' if kwargs.get('savepath'): mpl.rcParams['savefig.directory'] = kwargs.get('savepath') kwargs.pop('savepath', None) mpl.rcParams['savefig.bbox'] = 'tight' mpl.rcParams.update({'figure.autolayout': True}) # try to use tex # TO DO: # make some font kwargs here using_tex = False mpl.rcParams['font.family'] = 'sans-serif' mpl.rcParams['text.latex.unicode'] = True if tex == 'try' or tex is True: try: rc('text', usetex=True) rc('font', **{'family': 'serif', 'serif': ['Computer Modern']}) using_tex = True except: matplotlib.rc('font', family='sans-serif') matplotlib.rc('font', serif='Helvetica Neue') matplotlib.rc('text', usetex='false') rc('text', usetex=False) else: rc('text', usetex=False) if interactive: using_tex = False if show_totals is False: show_totals = 'none' # find out what kind of plot we're making, and enable # or disable interactive values if need be kwargs['kind'] = kind.lower() if interactive: if kwargs['kind'].startswith('bar'): interactive_types = [3] elif kwargs['kind'] == 'area': interactive_types = [2, 3] elif kwargs['kind'] == 'line': interactive_types = [2, 3] elif kwargs['kind'] == 'pie': interactive_types = None warnings.warn( 'Interactive plotting not yet available for pie plots.') else: interactive_types = [None] if interactive is False: interactive_types = [None] # find out if pie mode, add autopct format piemode = False if kind == 'pie': piemode = True # always the best spot for pie #if legend_pos == 'best': #legend_pos = 'lower left' if 
show_totals.endswith('plot') or show_totals.endswith('both'): kwargs['pctdistance'] = 0.6 if using_tex: kwargs['autopct'] = r'%1.1f\%%' else: kwargs['autopct'] = '%1.1f%%' # copy data, make series into df dataframe = df.copy() if kind == 'heatmap': try: dataframe = dataframe.T except: pass was_series = False if isinstance(dataframe, Series): was_series = True if not cumulative: dataframe = DataFrame(dataframe) else: dataframe = DataFrame(dataframe.cumsum()) else: # don't know if this is much good. if transpose: dataframe = dataframe.T if cumulative: dataframe = DataFrame(dataframe.cumsum()) if len(list(dataframe.columns)) == 1: was_series = True # attempt to convert x axis to ints: #try: # dataframe.index = [int(i) for i in list(dataframe.index)] #except: # pass # remove totals and tkinter order if not was_series: for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]): try: dataframe = dataframe.drop(name, axis=ax, errors='ignore') except: pass try: dataframe = dataframe.drop('tkintertable-order', errors='ignore') except: pass try: dataframe = dataframe.drop('tkintertable-order', axis=1, errors='ignore') except: pass # look at columns to see if all can be ints, in which case, set up figure # for depnumming if not was_series: if indices == 'guess': def isint(x): try: a = float(x) b = int(a) except (ValueError, OverflowError): return False else: return a == b if all([isint(x) is True for x in list(dataframe.columns)]): indices = True else: indices = False # if depnumming, plot all, transpose, and rename axes if indices is True: num_to_plot = 'all' dataframe = dataframe.T if y_label is None: y_label = 'Percentage of all matches' if x_label is None: x_label = '' # set backend? output_formats = [ 'svgz', 'ps', 'emf', 'rgba', 'raw', 'pdf', 'svg', 'eps', 'png', 'pgf' ] if output_format not in output_formats: raise ValueError('%s output format not recognised. 
Must be: %s' % (output_format, ', '.join(output_formats))) # don't know if these are necessary if 'pdf' in output_format: plt.switch_backend(output_format) if 'pgf' in output_format: plt.switch_backend(output_format) if num_to_plot == 'all': if was_series: if not piemode: num_to_plot = len(dataframe) else: num_to_plot = len(dataframe) else: if not piemode: num_to_plot = len(list(dataframe.columns)) else: num_to_plot = len(dataframe.index) # explode pie, or remove if not piemode if piemode and not sbplt and kwargs.get('explode'): kwargs['explode'] = auto_explode(dataframe, kwargs['explode'], was_series=was_series, num_to_plot=num_to_plot) else: kwargs.pop('explode', None) legend = kwargs.get('legend', True) #cut data short plotting_a_totals_column = False if was_series: if list(dataframe.columns)[0] != 'Total': try: can_be_ints = [int(x) for x in list(dataframe.index)] num_to_plot = len(dataframe) except: dataframe = dataframe[:num_to_plot] elif list(dataframe.columns)[0] == 'Total': plotting_a_totals_column = True if not 'legend' in kwargs: legend = False num_to_plot = len(dataframe) else: if transpose: dataframe = dataframe.head(num_to_plot) else: dataframe = dataframe.T.head(num_to_plot).T # remove stats fields, put p in entry text, etc. 
statfields = ['slope', 'intercept', 'r', 'p', 'stderr'] try: dataframe = dataframe.drop(statfields, axis=1, errors='ignore') except: pass try: dataframe.ix['p'] there_are_p_vals = True except: there_are_p_vals = False if show_p_val: if there_are_p_vals: newnames = [] for col in list(dataframe.columns): pval = dataframe[col]['p'] def p_string_formatter(val): if val < 0.001: if not using_tex: return 'p < 0.001' else: return r'p $<$ 0.001' else: return 'p = %s' % format(val, '.3f') pstr = p_string_formatter(pval) newname = '%s (%s)' % (col, pstr) newnames.append(newname) dataframe.columns = newnames dataframe.drop(statfields, axis=0, inplace=True, errors='ignore') else: warnings.warn( 'No p-values calculated to show.\n\nUse keep_stats kwarg while editing to generate these values.' ) else: if there_are_p_vals: dataframe.drop(statfields, axis=0, inplace=True, errors='ignore') # make and set y label absolutes = True if isinstance(dataframe, DataFrame): try: if not all([s.is_integer() for s in dataframe.iloc[0, :].values]): absolutes = False except: pass else: if not all([s.is_integer() for s in dataframe.values]): absolutes = False ########################################## ################ COLOURS ################# ########################################## # set defaults, with nothing for heatmap yet if colours is True or colours == 'default' or colours == 'Default': if kind != 'heatmap': colours = 'viridis' else: colours = 'default' # assume it's a single color, unless string denoting map cmap_or_c = 'color' if isinstance(colours, str): cmap_or_c = 'colormap' from matplotlib.colors import LinearSegmentedColormap if isinstance(colours, LinearSegmentedColormap): cmap_or_c = 'colormap' # for heatmaps, it's always a colormap if kind == 'heatmap': cmap_or_c = 'cmap' # if it's a defaulty string, set accordingly if isinstance(colours, str): if colours.lower().startswith('diverg'): colours = sns.diverging_palette(10, 133, as_cmap=True) # if default not set, do diverge for any 
df with a number < 0 elif colours.lower() == 'default': mn = dataframe.min() if isinstance(mn, Series): mn = mn.min() if mn < 0: colours = sns.diverging_palette(10, 133, as_cmap=True) else: colours = sns.light_palette("green", as_cmap=True) if 'seaborn' not in style: kwargs[cmap_or_c] = colours #if not was_series: # if kind in ['pie', 'line', 'area']: # if colours and not plotting_a_totals_column: # kwargs[cmap_or_c] = colours # else: # if colours: # kwargs[cmap_or_c] = colours #if piemode: # if num_to_plot > 0: # kwargs[cmap_or_c] = colours # else: # if num_to_plot > 0: # kwargs[cmap_or_c] = colours # multicoloured bar charts #if colours and cmap_or_c == 'colormap': # if kind.startswith('bar'): # if len(list(dataframe.columns)) == 1: # if not black_and_white: # import numpy as np # the_range = np.linspace(0, 1, num_to_plot) # middle = len(the_range) / 2 # try: # cmap = plt.get_cmap(colours) # kwargs[cmap_or_c] = [cmap(n) for n in the_range][middle] # except ValueError: # kwargs[cmap_or_c] = colours # # make a bar width ... ? ... 
# #kwargs['width'] = (figsize[0] / float(num_to_plot)) / 1.5 # reversing legend option if reverse_legend is True: rev_leg = True elif reverse_legend is False: rev_leg = False # show legend or don't, guess whether to reverse based on kind if kind in ['bar', 'barh', 'area', 'line', 'pie']: if was_series: legend = False if kind == 'pie': if pie_legend: legend = True else: legend = False if kind in ['barh', 'area']: if reverse_legend == 'guess': rev_leg = True if not 'rev_leg' in locals(): rev_leg = False # the default legend placement if legend_pos is True: legend_pos = 'best' # cut dataframe if just_totals try: tst = dataframe['Combined total'] dataframe = dataframe.head(num_to_plot) except: pass # no title for subplots because ugly, if title and not sbplt: kwargs['title'] = title # no interactive subplots yet: if sbplt and interactive: import warnings interactive = False warnings.warn('No interactive subplots yet, sorry.') return # not using pandas for labels or legend anymore. #kwargs['labels'] = None #kwargs['legend'] = False if legend: if num_to_plot > 6: if not kwargs.get('ncol'): kwargs['ncol'] = num_to_plot // 7 # kwarg options go in leg_options leg_options = { 'framealpha': leg_alpha, 'shadow': kwargs.get('shadow', False), 'ncol': kwargs.pop('ncol', 1) } # determine legend position based on this dict if legend_pos: possible = { 'best': 0, 'upper right': 1, 'upper left': 2, 'lower left': 3, 'lower right': 4, 'right': 5, 'center left': 6, 'center right': 7, 'lower center': 8, 'upper center': 9, 'center': 10, 'o r': 2, 'outside right': 2, 'outside upper right': 2, 'outside center right': 'center left', 'outside lower right': 'lower left' } if isinstance(legend_pos, int): the_loc = legend_pos elif isinstance(legend_pos, str): try: the_loc = possible[legend_pos] except KeyError: raise KeyError( 'legend_pos value must be one of:\n%s\n or an int between 0-10.' 
% ', '.join(list(possible.keys()))) leg_options['loc'] = the_loc #weirdness needed for outside plot if legend_pos in ['o r', 'outside right', 'outside upper right']: leg_options['bbox_to_anchor'] = (1.02, 1) if legend_pos == 'outside center right': leg_options['bbox_to_anchor'] = (1.02, 0.5) if legend_pos == 'outside lower right': leg_options['loc'] == 'upper right' leg_options['bbox_to_anchor'] = (0.5, 0.5) # a bit of distance between legend and plot for outside legends if isinstance(legend_pos, str): if legend_pos.startswith('o'): leg_options['borderaxespad'] = 1 if not piemode: if show_totals.endswith('both') or show_totals.endswith('legend'): dataframe = rename_data_with_total(dataframe, was_series=was_series, using_tex=using_tex, absolutes=absolutes) else: if pie_legend: if show_totals.endswith('both') or show_totals.endswith('legend'): dataframe = rename_data_with_total(dataframe, was_series=was_series, using_tex=using_tex, absolutes=absolutes) if piemode: if partial_pie: dataframe = dataframe / 100.0 # some pie things if piemode: if not sbplt: kwargs['y'] = list(dataframe.columns)[0] def filler(df): pby = df.T.copy() for i in list(pby.columns): tot = pby[i].sum() pby[i] = pby[i] * 100.0 / tot return pby.T areamode = False if kind == 'area': areamode = True if legend is False: kwargs['legend'] = False # line highlighting option for interactive! if interactive: if 2 in interactive_types: if kind == 'line': kwargs['marker'] = ',' if not piemode: kwargs['alpha'] = 0.1 # convert dates --- works only in my current case! 
#if plotting_a_totals_column or not was_series: # try: # can_it_be_int = int(list(dataframe.index)[0]) # can_be_int = True # except: # can_be_int = False # if can_be_int: # if 1500 < int(list(dataframe.index)[0]): # if 2050 > int(list(dataframe.index)[0]): # n = pandas.PeriodIndex([d for d in list(dataframe.index)], freq='A') # dataframe = dataframe.set_index(n) if kwargs.get('filled'): if areamode or kind.startswith('bar'): dataframe = filler(dataframe) kwargs.pop('filled', None) MARKERSIZE = 4 COLORMAP = { 0: { 'marker': None, 'dash': (None, None) }, 1: { 'marker': None, 'dash': [5, 5] }, 2: { 'marker': "o", 'dash': (None, None) }, 3: { 'marker': None, 'dash': [1, 3] }, 4: { 'marker': "s", 'dash': [5, 2, 5, 2, 5, 10] }, 5: { 'marker': None, 'dash': [5, 3, 1, 2, 1, 10] }, 6: { 'marker': 'o', 'dash': (None, None) }, 7: { 'marker': None, 'dash': [5, 3, 1, 3] }, 8: { 'marker': "1", 'dash': [1, 3] }, 9: { 'marker': "*", 'dash': [5, 5] }, 10: { 'marker': "2", 'dash': [5, 2, 5, 2, 5, 10] }, 11: { 'marker': "s", 'dash': (None, None) } } HATCHES = { 0: { 'color': '#dfdfdf', 'hatch': "/" }, 1: { 'color': '#6f6f6f', 'hatch': "\\" }, 2: { 'color': 'b', 'hatch': "|" }, 3: { 'color': '#dfdfdf', 'hatch': "-" }, 4: { 'color': '#6f6f6f', 'hatch': "+" }, 5: { 'color': 'b', 'hatch': "x" } } if black_and_white: if kind == 'line': kwargs['linewidth'] = 1 cmap = plt.get_cmap('Greys') new_cmap = truncate_colormap(cmap, 0.25, 0.95) if kind == 'bar': # darker if just one entry if len(dataframe.columns) == 1: new_cmap = truncate_colormap(cmap, 0.70, 0.90) kwargs[cmap_or_c] = new_cmap # remove things from kwargs if heatmap if kind == 'heatmap': hmargs = { 'annot': kwargs.pop('annot', True), cmap_or_c: kwargs.pop(cmap_or_c, None), 'fmt': kwargs.pop('fmt', ".2f"), 'cbar': kwargs.pop('cbar', False) } for i in [ 'vmin', 'vmax', 'linewidths', 'linecolor', 'robust', 'center', 'cbar_kws', 'cbar_ax', 'square', 'mask', 'norm' ]: if i in kwargs.keys(): hmargs[i] = kwargs.pop(i, None) class 
dummy_context_mgr(): """a fake context for plotting without style perhaps made obsolete by 'classic' style in new mpl""" def __enter__(self): return None def __exit__(self, one, two, three): return False with plt.style.context( (style)) if style != 'matplotlib' else dummy_context_mgr(): kwargs.pop('filled', None) if not sbplt: # check if negative values, no stacked if so if areamode: if not kwargs.get('ax'): kwargs['legend'] = False if dataframe.applymap(lambda x: x < 0.0).any().any(): kwargs['stacked'] = False rev_leg = False if kind != 'heatmap': # turn off pie labels at the last minute if kind == 'pie' and pie_legend: kwargs['labels'] = None kwargs['autopct'] = '%.2f' if kind == 'pie': kwargs.pop('color', None) ax = dataframe.plot(figsize=figsize, **kwargs) else: fg = plt.figure(figsize=figsize) if title: plt.title(title) ax = kwargs.get('ax', plt.axes()) tmp = sns.heatmap(dataframe, ax=ax, **hmargs) ax.set_title(title) for item in tmp.get_yticklabels(): item.set_rotation(0) plt.close(fg) if areamode and not kwargs.get('ax'): handles, labels = plt.gca().get_legend_handles_labels() del handles del labels if x_label: ax.set_xlabel(x_label) if y_label: ax.set_ylabel(y_label) else: if not kwargs.get('layout'): plt.gcf().set_tight_layout(False) if kind != 'heatmap': ax = dataframe.plot(figsize=figsize, **kwargs) else: plt.figure(figsize=figsize) if title: plt.title(title) ax = plt.axes() sns.heatmap(dataframe, ax=ax, **hmargs) plt.xticks(rotation=0) plt.yticks(rotation=0) def rotate_degrees(rotation, labels): if rotation is None: if max(labels, key=len) > 6: return 45 else: return 0 elif rotation is False: return 0 elif rotation is True: return 45 else: return rotation if sbplt: if 'layout' not in kwargs: axes = [l for l in ax] else: axes = [] cols = [l for l in ax] for col in cols: for bit in col: axes.append(bit) for index, a in enumerate(axes): if xtickspan is not False: a.xaxis.set_major_locator( ticker.MultipleLocator(xtickspan)) labels = [item.get_text() for 
item in a.get_xticklabels()] rotation = rotate_degrees(the_rotation, labels) try: if the_rotation == 0: ax.set_xticklabels(labels, rotation=rotation, ha='center') else: ax.set_xticklabels(labels, rotation=rotation, ha='right') except AttributeError: pass else: if kind == 'heatmap': labels = [item.get_text() for item in ax.get_xticklabels()] rotation = rotate_degrees(the_rotation, labels) if the_rotation == 0: ax.set_xticklabels(labels, rotation=rotation, ha='center') else: ax.set_xticklabels(labels, rotation=rotation, ha='right') if transparent: plt.gcf().patch.set_facecolor('white') plt.gcf().patch.set_alpha(0) if black_and_white: if kind == 'line': # white background # change everything to black and white with interesting dashes and markers c = 0 for line in ax.get_lines(): line.set_color('black') #line.set_width(1) line.set_dashes(COLORMAP[c]['dash']) line.set_marker(COLORMAP[c]['marker']) line.set_markersize(MARKERSIZE) c += 1 if c == len(list(COLORMAP.keys())): c = 0 # draw legend with proper placement etc if legend: if not piemode and not sbplt and kind != 'heatmap': if 3 not in interactive_types: handles, labels = plt.gca().get_legend_handles_labels() # area doubles the handles and labels. 
this removes half: #if areamode: # handles = handles[-len(handles) / 2:] # labels = labels[-len(labels) / 2:] if rev_leg: handles = handles[::-1] labels = labels[::-1] if kwargs.get('ax'): lgd = plt.gca().legend(handles, labels, **leg_options) ax.get_legend().draw_frame(leg_frame) else: lgd = plt.legend(handles, labels, **leg_options) lgd.draw_frame(leg_frame) if interactive: # 1 = highlight lines # 2 = line labels # 3 = legend switches ax = plt.gca() # fails for piemode lines = ax.lines handles, labels = plt.gca().get_legend_handles_labels() if 1 in interactive_types: plugins.connect(plt.gcf(), HighlightLines(lines)) if 3 in interactive_types: plugins.connect( plt.gcf(), InteractiveLegendPlugin(lines, labels, alpha_unsel=0.0)) for i, l in enumerate(lines): y_vals = l.get_ydata() x_vals = l.get_xdata() x_vals = [str(x) for x in x_vals] if absolutes: ls = [ '%s (%s: %d)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals) ] else: ls = [ '%s (%s: %.2f%%)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals) ] if 2 in interactive_types: #if 'kind' in kwargs and kwargs['kind'] == 'area': tooltip_line = mpld3.plugins.LineLabelTooltip( lines[i], labels[i]) mpld3.plugins.connect(plt.gcf(), tooltip_line) #else: if kind == 'line': tooltip_point = mpld3.plugins.PointLabelTooltip(l, labels=ls) mpld3.plugins.connect(plt.gcf(), tooltip_point) if piemode: if not sbplt: plt.axis('equal') ax.get_xaxis().set_visible(False) ax.get_yaxis().set_visible(False) # add x label # this could be revised now! 
# if time series period, it's year for now if isinstance(dataframe.index, pandas.tseries.period.PeriodIndex): x_label = 'Year' y_l = False if not absolutes: y_l = 'Percentage' else: y_l = 'Absolute frequency' # hacky: turn legend into subplot titles :) if sbplt: # title the big plot #plt.gca().suptitle(title, fontsize = 16) #plt.subplots_adjust(top=0.9) # get all axes if 'layout' not in kwargs: axes = [l for index, l in enumerate(ax)] else: axes = [] cols = [l for index, l in enumerate(ax)] for col in cols: for bit in col: axes.append(bit) # set subplot titles for index, a in enumerate(axes): try: titletext = list(dataframe.columns)[index] except: pass a.set_title(titletext) try: a.legend_.remove() except: pass #try: # from matplotlib.ticker import MaxNLocator # from corpkit.process import is_number # indx = list(dataframe.index) # if all([is_number(qq) for qq in indx]): # ax.get_xaxis().set_major_locator(MaxNLocator(integer=True)) #except: # pass # remove axis labels for pie plots if piemode: a.axes.get_xaxis().set_visible(False) a.axes.get_yaxis().set_visible(False) a.axis('equal') a.grid(b=show_grid) # add sums to bar graphs and pie graphs # doubled right now, no matter if not sbplt: # show grid ax.grid(b=show_grid) if kind.startswith('bar'): width = ax.containers[0][0].get_width() if was_series: the_y_limit = plt.ylim()[1] if show_totals.endswith('plot') or show_totals.endswith('both'): # make plot a bit higher if putting these totals on it plt.ylim([0, the_y_limit * 1.05]) for i, label in enumerate(list(dataframe.index)): if len(dataframe.ix[label]) == 1: score = dataframe.ix[label][0] else: if absolutes: score = dataframe.ix[label].sum() else: #import warnings #warnings.warn("It's not possible to determine total percentage from individual percentages.") continue if not absolutes: plt.annotate('%.2f' % score, (i, score), ha='center', va='bottom') else: plt.annotate(score, (i, score), ha='center', va='bottom') else: the_y_limit = plt.ylim()[1] if 
show_totals.endswith('plot') or show_totals.endswith('both'): for i, label in enumerate(list(dataframe.columns)): if len(dataframe[label]) == 1: score = dataframe[label][0] else: if absolutes: score = dataframe[label].sum() else: #import warnings #warnings.warn("It's not possible to determine total percentage from individual percentages.") continue if not absolutes: plt.annotate('%.2f' % score, (i, score), ha='center', va='bottom') else: plt.annotate(score, (i, score), ha='center', va='bottom') if not kwargs.get('layout') and not sbplt and not kwargs.get('ax'): plt.tight_layout() if kwargs.get('ax'): try: plt.gcf().set_tight_layout(False) except: pass try: plt.set_tight_layout(False) except: pass if save: if running_python_tex: imagefolder = '../images' else: imagefolder = 'images' savename = get_savename(imagefolder, save=save, title=title, ext=output_format) if not os.path.isdir(imagefolder): os.makedirs(imagefolder) # save image and get on with our lives if legend_pos.startswith('o') and not sbplt: plt.gcf().savefig(savename, dpi=150, bbox_extra_artists=(lgd, ), bbox_inches='tight', format=output_format) else: plt.gcf().savefig(savename, dpi=150, format=output_format) time = strftime("%H:%M:%S", localtime()) if os.path.isfile(savename): print('\n' + time + ": " + savename + " created.") else: raise ValueError("Error making %s." 
% savename) if dragmode: plt.legend().draggable() if sbplt: plt.subplots_adjust(right=.8) plt.subplots_adjust(left=.1) # add DataCursor to notebook backend if possible if have_mpldc: if kind == 'line': HighlightingDataCursor( plt.gca().get_lines(), highlight_width=4, highlight_color=False, formatter=lambda **kwargs: '%s: %s' % (kwargs['label'], "{0:.3f}".format(kwargs['y']))) else: datacursor(formatter=lambda **kwargs: '%s: %s' % (kwargs['label'], "{0:.3f}".format(kwargs['height']))) #if not interactive and not running_python_tex and not running_spider \ # and not tk: # plt.gcf().show() # return plt #elif running_spider or tk: # return plt if interactive: plt.subplots_adjust(right=.8) plt.subplots_adjust(left=.1) try: ax.legend_.remove() except: pass return mpld3.display() else: return plt
# Label the emissions table: rows by `timelist`, columns by `countries`.
emissionsdf.index = timelist
emissionsdf.columns = countries

#%%
# The data is not linear. Take the log-log.
def mylog(input):
    """Return the natural logarithm of *input*, passing NaN through unchanged."""
    if math.isnan(input):
        return (input)
    else:
        return (math.log(input))

# Element-wise log of both tables (NaN cells stay NaN, see `mylog`).
gdplogdf = gdpdf.applymap(mylog)
emissionslogdf = emissionsdf.applymap(mylog)

#%%
# Perform the regression.
# One linear fit per country ("Czech Republic" is excluded by the mask).
for country in countries[countries != "Czech Republic"]:
    # Two-column frame: log-GDP next to log-emissions for this country.
    somedf = pd.concat([gdplogdf[[country]], emissionslogdf[[country]]], axis=1)
    # Fit only on rows where both values are present.
    otherdf = somedf.dropna()
    regression = linear_model.LinearRegression()
    # NOTE(review): both concatenated columns carry the label `country`, so
    # `[[0]]` / `[[1]]` presumably rely on positional selection -- confirm
    # this works with the pandas version in use.
    regression.fit(otherdf[[0]].values, otherdf[[1]].values)
    # Rows whose second column is NaN get the value predicted by the fit.
    missing = somedf[np.isnan(somedf[[1]].values)]
    # NOTE(review): `missing` comes from boolean indexing; nothing visible
    # here writes the filled values back into `somedf` -- confirm they are
    # consumed later.
    missing[[1]] = regression.intercept_ + regression.coef_ * missing[[0]].values
def df_to_dict_for_json(df: pd.DataFrame) -> dict:
    """Convert *df* into a dict after passing every cell through ``value2safejson``.

    The result comes from ``DataFrame.to_dict()`` with its default
    orientation. Both the input frame and the resulting dict are logged at
    DEBUG level.
    """
    # Lazy %-style argument: the frame is only rendered to text when a DEBUG
    # record is actually emitted, instead of on every call.
    logging.debug('df_to_dict_for_json input df: %s', df)
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use the
    # new name when it exists and fall back to the old one otherwise.
    elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
    res = elementwise(value2safejson).to_dict()
    logging.debug(res)
    return res
"Where " "not text is null " "order by id") total_cache = curs.fetchall() #getting all description of each tag to compose the vocabulary curs_vocab = conn.cursor() curs_vocab.execute("SELECT ID||' '||Description " "FROM Community.Tag " "where not id is null " "and not Description is null " "order by id") total_vocab = curs_vocab.fetchall() df_vocab = DataFrame(columns=["vocab"], data=total_vocab) df_vocab = df_vocab.applymap(lambda s: s.lower() if type(s) == str else s) curs_tags = conn.cursor() curs_tags.execute("SELECT ID " "FROM Community.Tag " "where not id is null order by id") total_tags = curs_tags.fetchall() df_tags = DataFrame(columns=["tags"], data=total_tags) df_tags = df_tags.applymap(lambda s: s.lower() if type(s) == str else s) df = DataFrame(total_cache) df.columns = [x[0].lower() for x in curs.description] def clean_text(text): text = re.sub(r"what's", "what is ", text)
class Scores(object):
    """Table of per-label scores attached to the tracks of an annotation.

    Scores are stored in a `DataFrame` (`dataframe_`) with one row per
    (segment, track) pair and one column per label. An `Annotation`
    (`annotation_`) is kept alongside it and tracks which (segment, track)
    pairs exist.

    Parameters
    ----------
    uri : str, optional

    modality : str, optional

    Returns
    -------
    scores : `Scores`

    Examples
    --------

        >>> s = Scores(uri='video', modality='speaker')
        >>> s[Segment(0,1), 's1', 'A'] = 0.1
        >>> s[Segment(0,1), 's1', 'B'] = 0.2
        >>> s[Segment(0,1), 's1', 'C'] = 0.3
        >>> s[Segment(0,1), 's2', 'A'] = 0.4
        >>> s[Segment(0,1), 's2', 'B'] = 0.3
        >>> s[Segment(0,1), 's2', 'C'] = 0.2
        >>> s[Segment(2,3), 's1', 'A'] = 0.2
        >>> s[Segment(2,3), 's1', 'B'] = 0.1
        >>> s[Segment(2,3), 's1', 'C'] = 0.3

    """

    @classmethod
    def from_df(cls, df, uri=None, modality=None, aggfunc=np.mean):
        """Build scores from a long-format data frame.

        Parameters
        ----------
        df : DataFrame
            Must contain the following columns:
            'segment', 'track', 'label' and 'value'
        uri : str, optional
            Resource identifier
        modality : str, optional
            Modality
        aggfunc : func
            Value aggregation function in case of duplicate (segment, track,
            label) tuples

        Returns
        -------
        scores : `Scores`

        """
        # One row per (segment, track), one column per label.
        dataframe = pivot_table(
            df, values=PYANNOTE_SCORE,
            index=[PYANNOTE_SEGMENT, PYANNOTE_TRACK],
            columns=PYANNOTE_LABEL, aggfunc=aggfunc)

        # Register every (segment, track) pair with an empty label.
        annotation = Annotation(uri=uri, modality=modality)
        for index, _ in dataframe.iterrows():
            segment = Segment(*index[0])
            track = index[1]
            annotation[segment, track] = ''

        labels = dataframe.columns

        return cls(uri=uri, modality=modality,
                   annotation=annotation, labels=labels,
                   values=dataframe.values)

    def __init__(self, uri=None, modality=None,
                 annotation=None, labels=None,
                 values=None, dtype=None):
        """Create scores, optionally pre-filled from an annotation.

        Parameters
        ----------
        uri : str, optional
        modality : str, optional
        annotation : `Annotation`, optional
            When provided (and non-empty), it is copied and its tracks define
            the rows of the score table.
        labels : iterable, optional
            Column labels of the score table.
        values : array, optional
            Initial values; must expose a `dtype` attribute.
        dtype : optional
            Accepted for signature compatibility but not used: the table dtype
            is `float` when `values` is None and `values.dtype` otherwise.
        """
        super(Scores, self).__init__()

        names = [PYANNOTE_SEGMENT + '_' + field
                 for field in Segment._fields] + [PYANNOTE_TRACK]

        if annotation:
            annotation = annotation.copy()
            index = Index(
                [s + (t, ) for s, t in annotation.itertracks()],
                name=names)
        else:
            annotation = Annotation(uri=uri, modality=modality)
            # Empty multi-index with one level per name. The `labels` keyword
            # was renamed to `codes` in later pandas releases, so try the new
            # spelling first and fall back to the historical one.
            try:
                index = MultiIndex(levels=[list() for name in names],
                                   codes=[list() for name in names],
                                   names=names)
            except TypeError:
                index = MultiIndex(levels=[list() for name in names],
                                   labels=[list() for name in names],
                                   names=names)

        self.annotation_ = annotation

        columns = None if labels is None else list(labels)
        data = None if values is None else np.array(values)
        # `np.float` was a plain alias of the builtin `float` and has been
        # removed from NumPy; the builtin yields the same dtype.
        dtype = float if values is None else values.dtype

        self.dataframe_ = DataFrame(data=data, dtype=dtype,
                                    index=index, columns=columns)

        # Rows may be out of sync with the annotation until reindexed.
        self.hasChanged_ = True

        self.modality = modality
        self.uri = uri

    def copy(self):
        """Return a copy with its own data frame and annotation."""
        self._reindexIfNeeded()
        copied = self.__class__(uri=self.uri, modality=self.modality)
        copied.dataframe_ = self.dataframe_.copy()
        copied.annotation_ = self.annotation_.copy()
        copied.hasChanged_ = self.hasChanged_
        return copied

    # del scores[segment]
    # del scores[segment, :]
    # del scores[segment, track]
    def __delitem__(self, key):
        """Delete the scores of a segment or of a (segment, track) pair.

        Raises
        ------
        KeyError
            If `key` is neither a `Segment` nor a 2-tuple.
        """
        if isinstance(key, Segment):
            segment = key
            self.dataframe_.drop(tuple(segment), axis=0, inplace=True)
            del self.annotation_[segment]
            self.hasChanged_ = True

        elif isinstance(key, tuple) and len(key) == 2:
            segment, track = key
            self.dataframe_.drop(tuple(segment) + (track, ),
                                 axis=0, inplace=True)
            del self.annotation_[segment, track]
            self.hasChanged_ = True

        else:
            raise KeyError('')

    # value = scores[segment, track, label]
    def __getitem__(self, key):
        """Return the score stored for (segment, track, label).

        A 2-tuple (segment, label) is read as (segment, '_', label).
        """
        if len(key) == 2:
            key = (key[0], '_', key[1])

        segment, track, label = key
        return self.dataframe_.at[tuple(segment) + (track, ), label]

    # scores[segment, track, label] = value
    # scores[segment, label] ==== scores[segment, '_', label]
    def __setitem__(self, key, value):
        """Store `value` for (segment, track, label).

        A 2-tuple (segment, label) is read as (segment, '_', label).
        Falsy segments are silently ignored.
        """
        if len(key) == 2:
            key = (key[0], '_', key[1])

        segment, track, label = key

        # do not add empty track
        if not segment:
            return

        self.dataframe_.at[tuple(segment) + (track, ), label] = value
        self.annotation_[segment, track] = label
        self.hasChanged_ = True

    def __len__(self):
        """Number of annotated segments"""
        return len(self.annotation_)

    def __nonzero__(self):
        """Python 2 truth protocol; delegates to `__bool__`."""
        return self.__bool__()

    def __bool__(self):
        """False if annotation is empty"""
        return True if self.annotation_ else False

    def __contains__(self, included):
        """Check if segments are annotated

        Parameters
        ----------
        included : `Segment` or `Timeline`

        Returns
        -------
        contains : bool
            True if every segment in `included` is annotated, False otherwise.
        """
        return included in self.annotation_

    def __iter__(self):
        """Iterate over sorted segments"""
        return iter(self.annotation_.get_timeline(copy=False))

    def __reversed__(self):
        """Reverse iterate over sorted segments"""
        return reversed(self.annotation_.get_timeline(copy=False))

    def itersegments(self):
        """Iterate over segments (same as iterating over the instance)."""
        return iter(self)

    def tracks(self, segment):
        """Set of tracks for query segment

        Parameters
        ----------
        segment : `Segment`
            Query segment

        Returns
        -------
        tracks : set
            Set of tracks for query segment
        """
        return self.annotation_.get_tracks(segment)

    def has_track(self, segment, track):
        """Check whether a given track exists

        Parameters
        ----------
        segment : `Segment`
            Query segment
        track :
            Query track

        Returns
        -------
        exists : bool
            True if track exists for segment
        """
        return self.annotation_.has_track(segment, track)

    def get_track_by_name(self, track):
        """Get all tracks with given name

        Parameters
        ----------
        track : any valid track name
            Requested name track

        Returns
        -------
        tracks : list
            List of (segment, track) tuples
        """
        return self.annotation_.get_track_by_name(track)

    def new_track(self, segment, candidate=None, prefix=None):
        """Track name generator

        Parameters
        ----------
        segment : Segment
        prefix : str, optional
        candidate : any valid track name


        Returns
        -------
        track : str
            New track name
        """
        # Forward the caller's hints; they used to be replaced by None.
        return self.annotation_.new_track(segment,
                                          candidate=candidate,
                                          prefix=prefix)

    def itertracks(self):
        """Iterate over annotation as (segment, track) tuple"""
        return self.annotation_.itertracks()

    def itervalues(self):
        """Iterate over scores as (segment, track, label, value) tuple"""

        # make sure segment/track pairs are sorted
        self._reindexIfNeeded()

        labels = self.labels()

        # yield one (segment, track, label) tuple per loop
        for index, columns in self.dataframe_.iterrows():
            segment = Segment(*index[:-1])
            track = index[-1]
            for label in labels:
                value = columns[label]
                # NaN cells are treated as "no score" and skipped.
                if not np.isnan(value):
                    yield segment, track, label, value

    def get_track_scores(self, segment, track):
        """Get all scores for a given track.

        Parameters
        ----------
        segment : Segment
        track : hashable
            segment, track must be a valid track

        Returns
        -------
        scores : dict
            {label: score} dictionary
        """
        return dict(self.dataframe_.xs(tuple(segment) + (track, )))

    def labels(self):
        """List of labels

        Returns
        -------
        labels : list
            Sorted list of existing labels

        Remarks
        -------
            Labels are sorted based on their string representation.
        """
        return sorted(self.dataframe_.columns, key=str)

    def _reindexIfNeeded(self):
        """Realign data frame rows with the annotation tracks if flagged."""
        if not self.hasChanged_:
            return

        names = [PYANNOTE_SEGMENT + '_' + field
                 for field in Segment._fields] + [PYANNOTE_TRACK]

        new_index = Index(
            [s + (t, ) for s, t in self.annotation_.itertracks()],
            name=names)

        self.dataframe_ = self.dataframe_.reindex(new_index)

        self.hasChanged_ = False

        return

    def retrack(self):
        """Return a copy whose tracks come from `annotation_.retrack()`.

        The rows of the copy are relabelled with the new (segment, track)
        pairs, in the order the retracked annotation yields them.
        """
        self._reindexIfNeeded()
        retracked = self.copy()

        annotation = self.annotation_.retrack()
        retracked.annotation_ = annotation

        names = [PYANNOTE_SEGMENT + '_' + field
                 for field in Segment._fields] + [PYANNOTE_TRACK]
        new_index = Index(
            [s + (t, ) for s, t in annotation.itertracks()],
            name=names)
        retracked.dataframe_.index = new_index

        return retracked

    def apply(self, func, axis=0):
        """Return a copy whose table is `dataframe_.apply(func, axis=axis)`."""
        applied = self.copy()
        applied.dataframe_ = self.dataframe_.apply(func, axis=axis)
        applied.hasChanged_ = True

        return applied

    def rank(self, ascending=False):
        """Rank labels within each track.

        Parameters
        ----------
        ascending : boolean, default False
            False for ranks by high (0) to low (N-1)

        Returns
        -------
        rank : `Scores`

        """
        ranked = self.copy()
        # pandas ranks start at 1; shift so that the best rank is 0.
        ranked.dataframe_ = -1 + self.dataframe_.rank(axis=1,
                                                      ascending=ascending)
        ranked.hasChanged_ = True
        return ranked

    def nbest(self, n, ascending=False):
        """Keep only the n best scores of each track.

        Parameters
        ----------
        n : int
            Size of n-best list
        ascending : boolean, default False
            False for ranks by high (0) to low (N-1)

        Returns
        -------
        nbest : `Scores`
            New scores where only n-best are kept.

        """
        filtered = self.copy()
        ranked_ = -1 + self.dataframe_.rank(axis=1, ascending=ascending)
        # Cells ranked n or worse become NaN (`np.NaN` alias no longer exists).
        filtered.dataframe_ = filtered.dataframe_.where(ranked_ < n,
                                                        other=np.nan)
        filtered.hasChanged_ = True
        return filtered

    def subset(self, labels, invert=False):
        """Scores subset

        Extract scores subset based on labels

        Parameters
        ----------
        labels : set
            Set of labels
        invert : bool, optional
            If invert is True, extract all but requested `labels`

        Returns
        -------
        subset : `Scores`
            Scores subset.

        Raises
        ------
        TypeError
            If `labels` is not a set.
        """
        self._reindexIfNeeded()

        if not isinstance(labels, set):
            raise TypeError('labels must be provided as a set of labels.')

        if invert:
            labels = set(self.labels()) - labels
        else:
            labels = labels & set(self.labels())

        subset = Scores(uri=self.uri, modality=self.modality)
        # The annotation object is shared with the caller, not copied.
        subset.annotation_ = self.annotation_
        subset.dataframe_ = self.dataframe_[list(labels)]

        return subset

    def to_annotation(self, threshold=-np.inf, posterior=False):
        """Turn scores into an annotation by picking the best label per track.

        Parameters
        ----------
        threshold : float, optional
            Each track is annotated with the label with the highest score.
            Yet, if the latter is smaller than `threshold`, label is replaced
            with an `Unknown` instance.
        posterior : bool, optional
            If True, scores are posterior probabilities in open-set
            identification. If top model posterior is higher than unknown
            posterior, it is selected. Otherwise, label is replaced with an
            `Unknown` instance.

        Returns
        -------
        annotation : `Annotation`
        """
        if not self:
            return Annotation(uri=self.uri, modality=self.modality)

        best = self.nbest(1, ascending=False)
        large_enough = best.copy()

        if posterior:
            # Probability mass not assigned to any known label.
            unknown_posterior = 1. - self.dataframe_.sum(axis=1)

            large_enough.dataframe_ = (
                ((best.dataframe_.T > unknown_posterior) &
                 (best.dataframe_.T > threshold)).T
            )

        else:
            large_enough.dataframe_ = (
                (best.dataframe_.T > threshold).T
            )

        # Cells that were not the best of their track stay NaN so that
        # `itervalues` skips them.
        large_enough.dataframe_.where(best.dataframe_.notnull(),
                                      inplace=True, other=np.nan)

        annotation = Annotation(uri=self.uri, modality=self.modality)
        for segment, track, label, value in large_enough.itervalues():
            label = label if value else Unknown()
            annotation[segment, track] = label

        return annotation

    def map(self, func):
        """Apply function to all values"""
        mapped = self.copy()
        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1.
        if hasattr(DataFrame, 'map'):
            mapped.dataframe_ = self.dataframe_.map(func)
        else:
            mapped.dataframe_ = self.dataframe_.applymap(func)
        mapped.hasChanged_ = True
        return mapped

    def crop(self, focus, mode='strict'):
        """Crop on focus

        Parameters
        ----------
        focus : `Segment` or `Timeline`

        mode : {'strict', 'loose', 'intersection'}
            In 'strict' mode, only segments fully included in focus coverage
            are kept. In 'loose' mode, any intersecting segment is kept
            unchanged. In 'intersection' mode, only intersecting segments are
            kept and replaced by their actual intersection with the focus.

        Returns
        -------
        cropped : same type as caller
            Cropped version of the caller containing only tracks matching
            the provided focus and mode.

        Raises
        ------
        NotImplementedError
            In 'intersection' mode, which is not implemented yet.

        Remarks
        -------
        In 'intersection' mode, the best is done to keep the track names
        unchanged. However, in some cases where two original segments are
        cropped into the same resulting segments, conflicting track names are
        modified to make sure no track is lost.

        """
        if isinstance(focus, Segment):
            return self.crop(Timeline([focus], uri=self.uri), mode=mode)

        self._reindexIfNeeded()
        cropped = self.copy()

        if mode in ['strict', 'loose']:

            new_annotation = self.annotation_.crop(focus, mode=mode)
            # Boolean mask over rows, in the order tracks are iterated.
            keep = [new_annotation.has_track(segment, track)
                    for segment, track in self.itertracks()]
            cropped.dataframe_ = self.dataframe_[keep]
            cropped.annotation_ = new_annotation
            cropped.hasChanged_ = True

            return cropped

        elif mode in ['intersection']:

            raise NotImplementedError('')

            # # two original segments might be cropped into the same resulting
            # # segment -- therefore, we keep track of the mapping
            # intersection, mapping = timeline.crop(coverage,
            #                                       mode=mode, mapping=True)
            #
            # # create new empty annotation
            # A = self.__class__(uri=self.uri, modality=self.modality)
            #
            # for cropped in intersection:
            #     for original in mapping[cropped]:
            #         for track in self.tracks(original):
            #             # try to use original track name (candidate)
            #             # if it already exists, create a brand new one
            #             new_track = A.new_track(cropped, candidate=track)
            #             # copy each value, column by column
            #             for label in self.dataframe_.columns:
            #                 value = self.dataframe_.get_value((original, track),
            #                                            label)
            #                 A.dataframe_ = A.dataframe_.set_value((cropped, new_track),
            #                                         label, value)
            #
            # return A

    def __str__(self):
        """Human-friendly representation"""
        if self:
            self._reindexIfNeeded()
            return str(self.dataframe_)
        else:
            return ""

    def _repr_png_(self):
        """PNG representation, rendered by `repr_scores` from `.notebook`."""
        from .notebook import repr_scores
        return repr_scores(self)
def setFloatPrecision(df: pd.DataFrame):
    """Return a new frame with every cell of *df* rendered as ``$<value>``.

    Each cell is formatted with ``"${0:.2f}"``, i.e. a dollar sign followed
    by the value with two decimals. *df* itself is left untouched.

    The formatted frame used to be bound to a local name and discarded, so
    callers always received ``None``; it is now returned.
    """
    formatter = "${0:.2f}".format
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use the
    # new name when it exists and fall back to the old one otherwise.
    elementwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
    return elementwise(formatter)
index=['Ohio', 'Colorado', 'Utah', 'New York'], columns=['one', 'two', 'three', 'four']) # print data.ix['Colorado', ['two', 'three']] frame = DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon']) series = frame.ix[0] # print series frame = DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', "Oregon"]) f_format = lambda x: '%.2f' % x frame = frame.applymap(f_format) # print frame obj = Series(range(4), index=['d', 'a', 'b', 'c']) obj = obj.sort_index() frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'], columns=['d', 'a', 'b', 'c']) # print frame.sort_index(axis=1, ascending=False) frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]}) # print frame.sort_index(by=['a', 'b']) obj = Series([7, -5, 7, 4, 2, 0, 4]) # print obj.rank()
## Function application and mapping
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
f = lambda x: x.max() - x.min()
# Apply f to each column
frame.apply(f)
# Apply f to each row
frame.apply(f, axis=1)


# A function that returns several values
def f(x):
    return Series([x.min(), x.max()], index=['min', 'max'])


frame.apply(f)
# Element-wise Python function
# NOTE: this name shadows the builtin `format`; it is kept unchanged because
# it is a module-level name other code may refer to.
format = lambda x: '%.2f' % x
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1.
_elementwise = frame.map if hasattr(DataFrame, 'map') else frame.applymap
_elementwise(format)

## Sorting and ranking
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
# The method name was missing here (`obj. ()`), which is a syntax error;
# sorting by index label is what this section demonstrates.
obj.sort_index()
frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
frame.sort_index()
# Sort by column name
frame.sort_index(axis=1)
# Descending order
frame.sort_index(axis=1, ascending=False)
# Sort by value
obj = Series([4, 7, -3, 2])
# `Series.order()` no longer exists in pandas; `sort_values()` replaces it.
obj.sort_values()
# Missing values are always placed at the end when sorting
def summary(df: pd.DataFrame, title: str = '') -> Tuple[List[pd.DataFrame], BytesIO]:
    """Computes the summary for the given data frame.

    *df* must have exactly five columns, read in order as: date, item,
    creditor, debtor, amount. All cells are expected to be strings (they are
    stripped, and the amount column is then converted to float).

    :return tuple
        list of data frame: one data frame for each step; the steps are:
            "outlay, outlay (expanded groups), outlay (single persons),
             outlay (summed for each pair of persons),
             expenses (per person), balances (per person), clearing",
        BytesIO: containing the HTML version of the summary, UTF-8 encoded
    """
    date, item, creditor, debtor, amount = df.columns
    df = df[[creditor, debtor, item, amount]]

    def elementwise(frame, func):
        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1.
        if hasattr(pd.DataFrame, 'map'):
            return frame.map(func)
        return frame.applymap(func)

    df = elementwise(df, str.strip)
    df[[amount]] = elementwise(df[[amount]], float)

    # No groups are defined here; an empty frame is handed to
    # compute_recipients.
    df_groups = pd.DataFrame()

    notes = Notes(df.columns)
    notes.add(df, 'Outlay', index=[creditor, debtor, item])

    # Replace each debtor entry by what compute_recipients returns for it.
    df_expanded = df.copy()
    df_expanded[debtor] = df_expanded[debtor].map(
        partial(compute_recipients, groups=df_groups))
    notes.add(df_expanded, 'Outlay (expanded)', index=[creditor, debtor, item])

    # Split every amount evenly across its recipients and give each
    # recipient a row of their own.
    amount_per_person = df_expanded[amount] / df_expanded[debtor].str.len()
    receiving_persons = df_expanded[debtor].apply(pd.Series, 1).stack()
    receiving_persons.index = receiving_persons.index.droplevel(-1)
    receiving_persons.name = debtor
    df_stacked = df_expanded.drop(columns=debtor)
    df_stacked[[amount]] = amount_per_person
    df_stacked = df_stacked.join(receiving_persons)
    df_stacked = df_stacked[[creditor, debtor, item, amount]]
    notes.add(df_stacked, 'Outlay (stacked)', index=[creditor, debtor, item])

    df_summed = df_stacked.drop(columns=item).groupby([creditor, debtor]).sum()
    notes.add(df_summed, 'Outlay (summed)')

    # Creditor x debtor matrix, plus a totals row and a totals column.
    df_expenses = df_summed.unstack(fill_value=0.)
    df_expenses.columns = df_expenses.columns.droplevel(0)
    total_received = df_expenses.sum(axis=0)
    total_received.name = 'Total (received)'
    # DataFrame.append was removed in pandas 2.0. Build the extra row
    # explicitly and concatenate it, keeping the index name as append did.
    total_row = total_received.to_frame().T
    total_row.index.name = df_expenses.index.name
    df_expenses = pd.concat([df_expenses, total_row])
    total_paid = df_expenses.sum(axis=1)
    df_expenses['Total (paid)'] = total_paid
    total_paid.drop(index='Total (received)', inplace=True)
    notes.add(round_2(df_expenses), 'Expenses')

    balances = total_paid.subtract(total_received, fill_value=0.)
    balances.name = 'Balance'
    notes.add(round_2(balances), 'Balances')

    # Clearing: repeatedly pair the largest balance with the smallest one;
    # whichever side is fully settled is removed from the series.
    transfers = []
    while balances.size > 1:
        creditor = balances.idxmax()
        debtor = balances.idxmin()
        paid = balances[creditor]
        received = abs(balances[debtor])
        transfers.append((debtor, creditor, min(paid, received)))
        if paid >= received:
            balances[creditor] -= received
            balances.drop(index=debtor, inplace=True)
        else:
            balances[debtor] += paid
            balances.drop(index=creditor, inplace=True)

    df_transfers = pd.DataFrame(
        transfers, columns=['Writer', 'Recipient', 'Amount']
    ).set_index(['Writer', 'Recipient']).sort_index()
    notes.add(round_2(df_transfers), 'Clearing')

    return notes.steps, notes.render(title)
def test_to_csv_date_format(self):
    """Check that ``to_csv(date_format=...)`` formats datetimes as requested.

    For each engine, datetime values in cells, in the index and in the
    columns are written with a date format and read back with ``read_csv``;
    a frame containing NaT values is round-tripped as well.
    """
    from pandas import to_datetime

    compact = '%Y%m%d'
    dashed = '%Y-%m-%d'

    def as_compact_int(stamp):
        return int(stamp.strftime(compact))

    def as_compact_str(stamp):
        return stamp.strftime(compact)

    def as_dashed_str(stamp):
        return stamp.strftime(dashed)

    with ensure_clean('__tmp_to_csv_date_format__') as path:
        for engine in [None, 'python']:
            # Only the 'python' engine is expected to warn.
            if engine == 'python':
                w = FutureWarning
            else:
                w = None

            def write(frame, date_format):
                with tm.assert_produces_warning(w, check_stacklevel=False):
                    frame.to_csv(path, date_format=date_format, engine=engine)

            dt_index = self.tsframe.index
            datetime_frame = DataFrame(
                {'A': dt_index, 'B': dt_index.shift(1)}, index=dt_index)

            # Check that the data was put in the specified format
            write(datetime_frame, compact)
            test = read_csv(path, index_col=0)

            datetime_frame_int = datetime_frame.applymap(as_compact_int)
            datetime_frame_int.index = datetime_frame_int.index.map(
                as_compact_int)

            assert_frame_equal(test, datetime_frame_int)

            # Check that the data was put in the specified format
            write(datetime_frame, dashed)
            test = read_csv(path, index_col=0)

            datetime_frame_str = datetime_frame.applymap(as_dashed_str)
            datetime_frame_str.index = datetime_frame_str.index.map(
                as_dashed_str)

            assert_frame_equal(test, datetime_frame_str)

            # Check that columns get converted
            datetime_frame_columns = datetime_frame.T
            write(datetime_frame_columns, compact)
            test = read_csv(path, index_col=0)

            datetime_frame_columns = datetime_frame_columns.applymap(
                as_compact_int)
            # Columns don't get converted to ints by read_csv
            datetime_frame_columns.columns = (
                datetime_frame_columns.columns.map(as_compact_str))

            assert_frame_equal(test, datetime_frame_columns)

            # test NaTs
            nat_index = to_datetime(
                ['NaT'] * 10 + ['2000-01-01', '1/1/2000', '1-1-2000'])
            nat_frame = DataFrame({'A': nat_index}, index=nat_index)
            write(nat_frame, dashed)
            test = read_csv(path, parse_dates=[0, 1], index_col=0)

            assert_frame_equal(test, nat_frame)
# -*- coding: utf-8 -*-
"""Demo of DataFrame.apply, element-wise mapping and Series.map.

Every ``print`` is written as a call with exactly one argument (an empty
string for blank lines), so the script produces the same output on Python 2
and Python 3. The bare ``print x`` statements it used before are a syntax
error on Python 3.
"""
import numpy as np
from pandas import Series, DataFrame

frame = DataFrame(np.random.randn(4, 3),
                  columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)
print(np.abs(frame))
print('')

# Column-wise, then row-wise, range (max - min).
f = lambda x: x.max() - x.min()
print(frame.apply(f))
print(frame.apply(f, axis=1))


def f(x):
    """Return the minimum and maximum of *x* as a two-item Series."""
    return Series([x.min(), x.max()], index=['min', 'max'])


print(frame.apply(f))
print('')

print('applymap and map')
_format = lambda x: '%.2f' % x
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1.
_elementwise = frame.map if hasattr(DataFrame, 'map') else frame.applymap
print(_elementwise(_format))
print(frame['e'].map(_format))
def roundFloats(df: pd.DataFrame):
    """Return a new frame in which every ``float`` cell of *df* is rounded.

    Cells that are instances of ``float`` are replaced by
    ``roundTraditional(cell, 2)``; every other cell is passed through
    unchanged.
    """
    def round_cell(cell):
        if isinstance(cell, float):
            return roundTraditional(cell, 2)
        return cell

    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use the
    # new name when it exists and fall back to the old one otherwise.
    elementwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
    return elementwise(round_cell)
def getFDArray(self):
    """Load the initial FD table and cache its values and labels.

    The dict returned by ``InitialFDLoader.getInitialFD()`` is turned into a
    transposed DataFrame.  ``self.X`` receives the array made of the first
    item of every cell, ``self.labels`` the list of row labels.
    """
    def first_item(cell):
        return cell[0]

    table = DataFrame(InitialFDLoader.getInitialFD()).T
    first_items = table.applymap(first_item)
    self.X = first_items.values
    self.labels = list(table.index)
# Sort by clarity, descending.  ``sort_index(by=...)`` was deprecated in
# pandas 0.17 and later removed; ``sort_values`` is its replacement.
# The result is not assigned, so it is only visible in an interactive session.
data2.sort_values(by=['clarity'], ascending=False)

# groupby
data4 = DataFrame(
    data[:100], columns=['cut', 'color', 'clarity', 'carat', 'price'])
# Author's note: only numeric columns are aggregated.
# NOTE(review): pandas >= 2.0 no longer drops non-numeric columns silently.
data4.groupby('clarity').mean()
data4.groupby(['cut', 'clarity']).mean()  # grouping on two or more keys
data4.groupby(['cut', 'clarity']).mean()['price']  # accessing the result

# apply
data5 = DataFrame(data[:6], columns=['carat', 'price', 'depth'])
f = lambda x: x.max() - x.min()
data5.apply(f)
data5.apply(f, axis=1)  # row-wise (the default is column-wise)
f2 = lambda x: '%.2f' % x  # format numbers with two decimal places
data5.applymap(f2)  # applied to every element of the data frame

# vlookup
clarity_to_class = {
    'SI1': 'A', 'SI2': 'B', 'VS1': 'C', 'VS2': 'D', 'VVS2': 'E'}
data2['class'] = data2['clarity'].map(clarity_to_class)

# DB: SELECT statement
#import pandas.io.sql as sql
#con = sqlite3.connect(':memory:')
#sql.read_frame('select * from test', con)

# Write the data out - csv
data.to_csv('output.csv')

# Write the data out - Excel
class Scores(object):
    """Scores attached to (segment, track, label) triples.

    Values are stored in a pandas DataFrame (``dataframe_``) with one row per
    (segment, track) pair and one column per label.  An ``Annotation``
    (``annotation_``) is kept alongside to record which (segment, track)
    pairs exist; ``hasChanged_`` flags that the DataFrame rows may need to
    be realigned with it (see ``_reindexIfNeeded``).

    Parameters
    ----------
    uri : str, optional

    modality : str, optional

    Returns
    -------
    scores : `Scores`

    Examples
    --------

        >>> s = Scores(uri='video', modality='speaker')
        >>> s[Segment(0,1), 's1', 'A'] = 0.1
        >>> s[Segment(0,1), 's1', 'B'] = 0.2
        >>> s[Segment(0,1), 's1', 'C'] = 0.3
        >>> s[Segment(0,1), 's2', 'A'] = 0.4
        >>> s[Segment(0,1), 's2', 'B'] = 0.3
        >>> s[Segment(0,1), 's2', 'C'] = 0.2
        >>> s[Segment(2,3), 's1', 'A'] = 0.2
        >>> s[Segment(2,3), 's1', 'B'] = 0.1
        >>> s[Segment(2,3), 's1', 'C'] = 0.3

    """

    @classmethod
    def from_df(
        cls, df,
        uri=None, modality=None, aggfunc=np.mean
    ):
        """Build scores from a long-format DataFrame.

        Parameters
        ----------
        df : DataFrame
            Must contain the following columns:
            'segment', 'track', 'label' and 'value'
        uri : str, optional
            Resource identifier
        modality : str, optional
            Modality
        aggfunc : func
            Value aggregation function in case of duplicate (segment, track,
            label) tuples

        Returns
        -------
        scores : `Scores`

        """
        dataframe = pivot_table(
            df, values=PYANNOTE_SCORE,
            index=[PYANNOTE_SEGMENT, PYANNOTE_TRACK], columns=PYANNOTE_LABEL,
            aggfunc=aggfunc
        )

        # register every (segment, track) row with an empty label
        annotation = Annotation(uri=uri, modality=modality)
        for index, _ in dataframe.iterrows():
            segment = Segment(*index[0])
            track = index[1]
            annotation[segment, track] = ''

        labels = dataframe.columns

        return cls(uri=uri, modality=modality,
                   annotation=annotation, labels=labels,
                   values=dataframe.values)

    def __init__(self, uri=None, modality=None,
                 annotation=None, labels=None,
                 values=None, dtype=None):
        """
        Parameters
        ----------
        uri : str, optional
        modality : str, optional
        annotation : Annotation, optional
            Its tracks define the rows; it is copied, not shared.
        labels : iterable, optional
            Column labels.
        values : array-like, optional
            Score matrix, one row per track and one column per label.
        dtype : optional
            DataFrame dtype.  When omitted, ``float`` is used for empty
            scores and the dtype of `values` otherwise.
        """

        super(Scores, self).__init__()

        names = [PYANNOTE_SEGMENT + '_' + field
                 for field in Segment._fields] + [PYANNOTE_TRACK]

        if annotation:
            annotation = annotation.copy()
            index = Index(
                [s + (t, ) for s, t in annotation.itertracks()],
                name=names)

        else:
            annotation = Annotation(uri=uri, modality=modality)
            levels = [list() for _ in names]
            codes = [list() for _ in names]
            try:
                # pandas >= 0.24 names this keyword `codes`
                index = MultiIndex(levels=levels, codes=codes, names=names)
            except TypeError:
                # older pandas only accepts it as `labels`
                index = MultiIndex(levels=levels, labels=codes, names=names)

        self.annotation_ = annotation
        columns = None if labels is None else list(labels)
        data = None if values is None else np.array(values)

        # Honour an explicit dtype; otherwise infer it.  `data.dtype` (rather
        # than `values.dtype`) also works when `values` is a plain list, and
        # builtin `float` is what the removed `np.float` alias pointed to.
        if dtype is None:
            dtype = float if data is None else data.dtype

        self.dataframe_ = DataFrame(data=data, dtype=dtype,
                                    index=index, columns=columns)

        self.hasChanged_ = True

        self.modality = modality
        self.uri = uri

    def copy(self):
        """Return a copy with its own DataFrame and annotation."""
        self._reindexIfNeeded()
        copied = self.__class__(uri=self.uri, modality=self.modality)
        copied.dataframe_ = self.dataframe_.copy()
        copied.annotation_ = self.annotation_.copy()
        copied.hasChanged_ = self.hasChanged_
        return copied

    # del scores[segment]
    # del scores[segment, :]
    # del scores[segment, track]
    def __delitem__(self, key):
        """Delete a whole segment or a single (segment, track).

        Raises
        ------
        KeyError
            If `key` is neither a `Segment` nor a 2-tuple.
        """

        if isinstance(key, Segment):
            segment = key
            self.dataframe_.drop(tuple(segment), axis=0, inplace=True)
            del self.annotation_[segment]
            self.hasChanged_ = True

        elif isinstance(key, tuple) and len(key) == 2:
            segment, track = key
            self.dataframe_.drop(tuple(segment) + (track, ),
                                 axis=0, inplace=True)
            del self.annotation_[segment, track]
            self.hasChanged_ = True

        else:
            raise KeyError('')

    # value = scores[segment, track, label]
    def __getitem__(self, key):
        """Return the score stored for (segment, track, label).

        A 2-tuple (segment, label) is read as (segment, '_', label).
        """

        if len(key) == 2:
            key = (key[0], '_', key[1])

        segment, track, label = key
        return self.dataframe_.at[tuple(segment) + (track, ), label]

    # scores[segment, track, label] = value
    # scores[segment, label] ==== scores[segment, '_', label]
    def __setitem__(self, key, value):
        """Store `value` for (segment, track, label).

        A 2-tuple (segment, label) is read as (segment, '_', label).
        Falsy segments are ignored.
        """

        if len(key) == 2:
            key = (key[0], '_', key[1])

        segment, track, label = key

        # do not add empty track
        if not segment:
            return

        self.dataframe_.at[tuple(segment) + (track,), label] = value
        self.annotation_[segment, track] = label
        self.hasChanged_ = True

    def __len__(self):
        """Number of annotated segments"""
        return len(self.annotation_)

    def __nonzero__(self):
        # Python 2 spelling of __bool__
        return self.__bool__()

    def __bool__(self):
        """False if annotation is empty"""
        return bool(self.annotation_)

    def __contains__(self, included):
        """Check if segments are annotated

        Parameters
        ----------
        included : `Segment` or `Timeline`

        Returns
        -------
        contains : bool
            True if every segment in `included` is annotated, False otherwise.
        """
        return included in self.annotation_

    def __iter__(self):
        """Iterate over sorted segments"""
        return iter(self.annotation_.get_timeline())

    def __reversed__(self):
        """Reverse iterate over sorted segments"""
        return reversed(self.annotation_.get_timeline())

    def itersegments(self):
        """Iterate over sorted segments (same as ``iter(self)``)"""
        return iter(self)

    def tracks(self, segment):
        """Set of tracks for query segment

        Parameters
        ----------
        segment : `Segment`
            Query segment

        Returns
        -------
        tracks : set
            Set of tracks for query segment
        """
        return self.annotation_.get_tracks(segment)

    def has_track(self, segment, track):
        """Check whether a given track exists

        Parameters
        ----------
        segment : `Segment`
            Query segment
        track :
            Query track

        Returns
        -------
        exists : bool
            True if track exists for segment
        """
        return self.annotation_.has_track(segment, track)

    def get_track_by_name(self, track):
        """Get all tracks with given name

        Parameters
        ----------
        track : any valid track name
            Requested name track

        Returns
        -------
        tracks : list
            List of (segment, track) tuples
        """
        return self.annotation_.get_track_by_name(track)

    def new_track(self, segment, candidate=None, prefix=None):
        """Track name generator

        Parameters
        ----------
        segment : Segment
        prefix : str, optional
        candidate : any valid track name


        Returns
        -------
        track : str
            New track name
        """

        # forward the caller's arguments (they used to be replaced by None)
        return self.annotation_.new_track(segment,
                                          candidate=candidate,
                                          prefix=prefix)

    def itertracks(self):
        """Iterate over annotation as (segment, track) tuple"""
        return self.annotation_.itertracks()

    def itervalues(self):
        """Iterate over scores as (segment, track, label, value) tuple

        NaN values are skipped.
        """

        # make sure segment/track pairs are sorted
        self._reindexIfNeeded()

        labels = self.labels()

        # yield one (segment, track, label) tuple per loop
        for index, columns in self.dataframe_.iterrows():
            segment = Segment(*index[:-1])
            track = index[-1]
            for label in labels:
                value = columns[label]
                if not np.isnan(value):
                    yield segment, track, label, value

    def get_track_scores(self, segment, track):
        """Get all scores for a given track.

        Parameters
        ----------
        segment : Segment
        track : hashable
            segment, track must be a valid track

        Returns
        -------
        scores : dict
            {label: score} dictionary
        """
        return dict(self.dataframe_.xs(tuple(segment) + (track, )))

    def labels(self, unknown=True):
        """List of labels

        Parameters
        ----------
        unknown : bool, optional
            When False, do not return Unknown instances
            When True, return any label (even Unknown instances)

        Returns
        -------
        labels : list
            Sorted list of existing labels

        Remarks
        -------
            Labels are sorted based on their string representation.
        """
        labels = sorted(self.dataframe_.columns, key=str)
        if unknown:
            return labels
        else:
            return [l for l in labels if not isinstance(l, Unknown)]

    def _reindexIfNeeded(self):
        """Realign DataFrame rows with the annotation's tracks if flagged."""

        if not self.hasChanged_:
            return

        names = [PYANNOTE_SEGMENT + '_' + field
                 for field in Segment._fields] + [PYANNOTE_TRACK]

        new_index = Index(
            [s + (t, ) for s, t in self.annotation_.itertracks()],
            name=names)

        self.dataframe_ = self.dataframe_.reindex(new_index)

        self.hasChanged_ = False

        return

    def retrack(self):
        """Return a copy whose tracks are those of ``annotation_.retrack()``.

        The copy's rows are relabelled with the new (segment, track) pairs.
        """

        self._reindexIfNeeded()
        retracked = self.copy()

        annotation = self.annotation_.retrack()
        retracked.annotation_ = annotation

        names = [PYANNOTE_SEGMENT + '_' + field
                 for field in Segment._fields] + [PYANNOTE_TRACK]
        new_index = Index(
            [s + (t, ) for s, t in annotation.itertracks()],
            name=names)
        retracked.dataframe_.index = new_index

        return retracked

    def apply(self, func, axis=0):
        """Return a copy whose DataFrame is ``dataframe_.apply(func, axis)``"""

        applied = self.copy()
        applied.dataframe_ = self.dataframe_.apply(func, axis=axis)
        applied.hasChanged_ = True

        return applied

    def rank(self, ascending=False):
        """

        Parameters
        ----------
        ascending : boolean, default False
            False for ranks by high (0) to low (N-1)

        Returns
        -------
        rank : `Scores`

        """

        ranked = self.copy()
        # pandas ranks start at 1; shift so that the best rank is 0
        ranked.dataframe_ = -1 + self.dataframe_.rank(axis=1,
                                                      ascending=ascending)
        ranked.hasChanged_ = True
        return ranked

    def nbest(self, n, ascending=False):
        """

        Parameters
        ----------
        n : int
            Size of n-best list
        ascending : boolean, default False
            False for ranks by high (0) to low (N-1)

        Returns
        -------
        nbest : `Scores`
            New scores where only n-best are kept.

        """

        filtered = self.copy()
        ranked_ = -1 + self.dataframe_.rank(axis=1, ascending=ascending)
        # `np.nan` is available on every NumPy; the `np.NaN` alias is not
        filtered.dataframe_ = filtered.dataframe_.where(ranked_ < n,
                                                        other=np.nan)
        filtered.hasChanged_ = True
        return filtered

    def subset(self, labels, invert=False):
        """Scores subset

        Extract scores subset based on labels

        Parameters
        ----------
        labels : set
            Set of labels
        invert : bool, optional
            If invert is True, extract all but requested `labels`

        Returns
        -------
        subset : `Scores`
            Scores subset.

        Raises
        ------
        TypeError
            If `labels` is not a set.
        """

        self._reindexIfNeeded()

        if not isinstance(labels, set):
            raise TypeError('labels must be provided as a set of labels.')

        if invert:
            labels = set(self.labels()) - labels
        else:
            labels = labels & set(self.labels())

        subset = Scores(uri=self.uri, modality=self.modality)
        # NOTE: the annotation object is shared with `self`, not copied
        subset.annotation_ = self.annotation_
        subset.dataframe_ = self.dataframe_[list(labels)]

        return subset

    def to_annotation(self, threshold=-np.inf, posterior=False):
        """

        Parameters
        ----------
        threshold : float, optional
            Each track is annotated with the label with the highest score.
            Yet, if the latter is smaller than `threshold`, label is replaced
            with an `Unknown` instance.
        posterior : bool, optional
            If True, scores are posterior probabilities in open-set
            identification. If top model posterior is higher than unknown
            posterior, it is selected. Otherwise, label is replaced with an
            `Unknown` instance.

        Returns
        -------
        annotation : `Annotation`
        """

        if not self:
            return Annotation(uri=self.uri, modality=self.modality)

        best = self.nbest(1, ascending=False)
        large_enough = best.copy()

        if posterior:
            unknown_posterior = 1. - self.dataframe_.sum(axis=1)

            large_enough.dataframe_ = (
                ((best.dataframe_.T > unknown_posterior) &
                 (best.dataframe_.T > threshold)).T
            )

        else:

            large_enough.dataframe_ = (
                (best.dataframe_.T > threshold).T
            )

        # keep NaN wherever `best` has no value, so that itervalues() skips it
        large_enough.dataframe_.where(best.dataframe_.notnull(),
                                      inplace=True,
                                      other=np.nan)

        annotation = Annotation(uri=self.uri, modality=self.modality)
        for segment, track, label, value in large_enough.itervalues():
            label = label if value else Unknown()
            annotation[segment, track] = label

        return annotation

    def map(self, func):
        """Apply function to all values"""

        mapped = self.copy()
        # DataFrame.applymap was renamed DataFrame.map in pandas 2.1
        if hasattr(DataFrame, 'map'):
            mapped.dataframe_ = self.dataframe_.map(func)
        else:
            mapped.dataframe_ = self.dataframe_.applymap(func)
        mapped.hasChanged_ = True
        return mapped

    def crop(self, focus, mode='strict'):
        """Crop on focus

        Parameters
        ----------
        focus : `Segment` or `Timeline`

        mode : {'strict', 'loose', 'intersection'}
            In 'strict' mode, only segments fully included in focus coverage
            are kept. In 'loose' mode, any intersecting segment is kept
            unchanged. In 'intersection' mode, only intersecting segments are
            kept and replaced by their actual intersection with the focus.

        Returns
        -------
        cropped : same type as caller
            Cropped version of the caller containing only tracks matching
            the provided focus and mode.

        Raises
        ------
        NotImplementedError
            In 'intersection' mode.

        Remarks
        -------
        In 'intersection' mode, the best is done to keep the track names
        unchanged. However, in some cases where two original segments are
        cropped into the same resulting segments, conflicting track names are
        modified to make sure no track is lost.

        """

        if isinstance(focus, Segment):
            return self.crop(Timeline([focus], uri=self.uri), mode=mode)

        self._reindexIfNeeded()
        cropped = self.copy()

        if mode in ['strict', 'loose']:

            new_annotation = self.annotation_.crop(focus, mode=mode)
            # boolean row mask, in the same order as the DataFrame rows
            keep = [new_annotation.has_track(segment, track)
                    for segment, track in self.itertracks()]
            cropped.dataframe_ = self.dataframe_[keep]
            cropped.annotation_ = new_annotation
            cropped.hasChanged_ = True

            return cropped

        elif mode in ['intersection']:

            raise NotImplementedError('')

            # # two original segments might be cropped into the same resulting
            # # segment -- therefore, we keep track of the mapping
            # intersection, mapping = timeline.crop(coverage,
            #                                       mode=mode, mapping=True)
            #
            # # create new empty annotation
            # A = self.__class__(uri=self.uri, modality=self.modality)
            #
            # for cropped in intersection:
            #     for original in mapping[cropped]:
            #         for track in self.tracks(original):
            #             # try to use original track name (candidate)
            #             # if it already exists, create a brand new one
            #             new_track = A.new_track(cropped, candidate=track)
            #             # copy each value, column by column
            #             for label in self.dataframe_.columns:
            #                 value = self.dataframe_.get_value((original, track),
            #                                                   label)
            #                 A.dataframe_ = A.dataframe_.set_value((cropped, new_track),
            #                                                       label, value)
            #
            # return A

    def __str__(self):
        """Human-friendly representation"""
        if self:
            self._reindexIfNeeded()
            return str(self.dataframe_)
        else:
            return ""

    def _repr_png_(self):
        # imported lazily so that the notebook module is only loaded on use
        from .notebook import repr_scores
        return repr_scores(self)
def _df_contains_substring(df: pd.DataFrame, substring: str) -> bool: """Returns True if any entity in |df| contains the substring.""" df_contains_str_mask = df.applymap( lambda element: substring.lower() in str(element).lower()) return df_contains_str_mask.any().any()
def test_default_handler(self):
    """``to_json(default_handler=str)`` must stringify unsupported objects.

    A frame holding a bare ``object()`` is serialised with ``str`` as the
    fallback handler and read back; the result has to equal the frame with
    ``str`` applied to every cell.
    """
    opaque = object()
    source = DataFrame({'a': ['a', opaque]})

    stringified = source.applymap(str)

    payload = source.to_json(default_handler=str)
    round_tripped = pd.read_json(payload)

    assert_frame_equal(stringified, round_tripped)
d=DataFrame( {'a':range(0,10),'b':range(10,20),'c':range(20,30)}, index=list(letters[:10]) ) #%% 返回值可是是一个数值 def f1(x):return x.max() d.apply(f1) #%% 返回值也可以是一个系列 def f2(x):return Series([x.max(),x.min()],index=[1,2]) d.apply(f2) #%% 但返回值不能是一个数据框 def f3(x): return DataFrame({'a':range(0,10),'b':range(10,20),'c':range(20,30)}) d.apply(f3) #%% 列表也不行 def f4(x):return [1,2,3] d.apply(f4) #%% 可以应用到每一个元素 def f5(x): return x%3 d.applymap(f5) #%% applymap的返回值可以使用列表 def f6(x): return [x,x%3] d.applymap(f6)
def test_default_handler(self):
    """``to_json(default_handler=str)`` must stringify unsupported objects.

    A frame holding a bare ``object()`` is serialised with ``str`` as the
    fallback handler and read back; the result has to equal the frame with
    ``str`` applied to every cell.  Index types are not compared.
    """
    opaque = object()
    source = DataFrame({'a': ['a', opaque]})

    stringified = source.applymap(str)

    payload = source.to_json(default_handler=str)
    round_tripped = pd.read_json(payload)

    assert_frame_equal(stringified, round_tripped, check_index_type=False)
# Every print uses the single-argument call form, valid on Python 2 and 3.
# ``.ix`` (removed in pandas 1.0) is replaced by ``.loc`` / ``.iloc`` and
# ``sort_index(by=...)`` by ``sort_values(by=...)``.

# Overwrite every value below 5 with 3, then select by label.
obj5[obj5 < 5] = 3
print(obj5)
print(obj5.loc["Ohio", ["one", "two"]])

# Adding Series aligns on the index labels.
s1 = Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
print(s1 + s2)

df1 = DataFrame(np.arange(9).reshape((3, 3)),
                columns=list("bcd"),
                index=["Ohin", "Texa", "Colorado"])
df2 = DataFrame(np.arange(12).reshape((4, 3)),
                columns=list("bcd"),
                index=["Utah", "Ohin", "Texa", "Colorado"])
print(df1 + df2)
print(df1.add(df2, fill_value=0))

# First row by position: df2 has a string index, so ``.ix[0]`` was positional.
series2 = df2.iloc[0]
print(df2 - series2)

# Reduce each column (default) or each row (axis=1) to its max - min.
ff = lambda x: x.max() - x.min()
print(df2.apply(ff))
print(df2.apply(ff, axis=1))

df3 = DataFrame(np.random.randn(3, 3),
                columns=list("bcd"),
                index=["Ohin", "Texa", "Colorado"])
ff2 = lambda x: "%.2f" % x
print(df3)
# NOTE: DataFrame.applymap is named DataFrame.map from pandas 2.1 onwards.
print(df3.applymap(ff2))
print(df3)
print(df3.sort_values(by="b"))
# Every print uses the single-argument call form, valid on Python 2 and 3.

# Fill the missing entries of the column with the column mean.  The original
# passed the function object ``np.mean`` as the fill value and dropped the
# result, so df3 was printed unchanged.
df3['float_col'] = df3['float_col'].fillna(df3['float_col'].mean())
print("Df rellenando el elemento vacio de las columnas con la media")
print(df3)
print("*"*15)
print("Definimos la función F")


def f(x):
    """Prefix strings with 'applymap_', scale truthy values by 100.

    Falsy non-string values give None.
    """
    if isinstance(x, str):
        return 'applymap_' + x
    elif x:
        return 100 * x
    else:
        return


print("Aplicamos F al dataframe")
# applymap returns a new frame; print that result instead of the untouched df.
print(df.applymap(f))
print("*"*15)
print("Definimos de nuevo el dataframe")
df = pd.DataFrame(data={"A":[1,2], "B":[2.6,1.3]})
print(df)
print("añadimos columnas combinando las actuales")
df["C"] = df["A"]+df["B"]
df["D"] = df["A"]*3
df["E"] = np.sqrt(df["A"])
print(df)
print("*"*15)
print("Datos disponibles de un dataframe")
print(" descripcion del dataframe")
print(df.describe())
# Spread (max - min) of each column, then of each row.
def f(x):
    return x.max() - x.min()


print(frame.apply(f))
print('\n')

print(frame.apply(f, axis=1))
print('\n')

###############################################################


# ``f`` is rebound: it now returns the min and max as a labelled Series.
def f(x):
    return Series([x.min(), x.max()], index=['min','max'])


print(frame.apply(f))
print('\n')


# NOTE: this name shadows the builtin ``format`` for the rest of the module.
def format(x):
    return '%.2f'%x


print(frame.applymap(format))
print('\n')

print(frame['e'].map(format))
# Every print uses the single-argument call form, valid on Python 2 and 3.
# The expected-output blocks were no-op string literals; they are comments now.
def f(x):
    """Return the min and max of *x* as a Series labelled 'min' / 'max'."""
    # x is a Series here (author's note; apply's default passes each column)
    return Series([x.min(), x.max()], index=['min', 'max'])


print(frame.apply(f))
# Expected output:
#      A  B  C
# min  0  1  2
# max  6  7  8

print('applymap和map:作用到每一个元素')
_format = lambda x: '%.2f' % x
print(frame.applymap(_format))  # element-wise on a DataFrame
# Expected output:
#       A     B     C
# a  0.00  1.00  2.00
# b  3.00  4.00  5.00
# c  6.00  7.00  8.00

print(frame['A'].map(_format))  # element-wise on a Series
# Expected output:
# a    0.00
# b    3.00
# c    6.00
# Name: A, dtype: object

print('Series排序')
def plotter(title, df, kind = 'line', x_label = None, y_label = None, style = 'ggplot', figsize = (8, 4), save = False, legend_pos = 'best', reverse_legend = 'guess', num_to_plot = 7, tex = 'try', colours = 'Accent', cumulative = False, pie_legend = True, partial_pie = False, show_totals = False, transparent = False, output_format = 'png', interactive = False, black_and_white = False, show_p_val = False, indices = False, **kwargs): """Visualise corpus interrogations. :param title: A title for the plot :type title: str :param df: Data to be plotted :type df: pandas.core.frame.DataFrame :param x_label: A label for the x axis :type x_label: str :param y_label: A label for the y axis :type y_label: str :param kind: The kind of chart to make :type kind: str ('line'/'bar'/'barh'/'pie'/'area') :param style: Visual theme of plot :type style: str ('ggplot'/'bmh'/'fivethirtyeight'/'seaborn-talk'/etc) :param figsize: Size of plot :type figsize: tuple (int, int) :param save: If bool, save with *title* as name; if str, use str as name :type save: bool/str :param legend_pos: Where to place legend :type legend_pos: str ('upper right'/'outside right'/etc) :param reverse_legend: Reverse the order of the legend :type reverse_legend: bool :param num_to_plot: How many columns to plot :type num_to_plot: int/'all' :param tex: Use TeX to draw plot text :type tex: bool :param colours: Colourmap for lines/bars/slices :type colours: str :param cumulative: Plot values cumulatively :type cumulative: bool :param pie_legend: Show a legend for pie chart :type pie_legend: bool :param partial_pie: Allow plotting of pie slices only :type partial_pie: bool :param show_totals: Print sums in plot where possible :type show_totals: str -- 'legend'/'plot'/'both' :param transparent: Transparent .png background :type transparent: bool :param output_format: File format for saved image :type output_format: str -- 'png'/'pdf' :param black_and_white: Create black and white line styles :type black_and_white: 
bool :param show_p_val: Attempt to print p values in legend if contained in df :type show_p_val: bool :param indices: To use when plotting "distance from root" :type indices: bool :param stacked: When making bar chart, stack bars on top of one another :type stacked: str :param filled: For area and bar charts, make every column sum to 100 :type filled: str :param legend: Show a legend :type legend: bool :param rot: Rotate x axis ticks by *rot* degrees :type rot: int :param subplots: Plot each column separately :type subplots: bool :param layout: Grid shape to use when *subplots* is True :type layout: tuple -- (int, int) :param interactive: Experimental interactive options :type interactive: list -- [1, 2, 3] :returns: matplotlib figure """ import corpkit import os try: from IPython.utils.shimmodule import ShimWarning import warnings warnings.simplefilter('ignore', ShimWarning) except: pass import matplotlib as mpl from matplotlib import rc # prefer seaborn plotting try: import seaborn as sns except: pass if interactive: import matplotlib.pyplot as plt, mpld3 else: import matplotlib.pyplot as plt import pandas from pandas import DataFrame import numpy from time import localtime, strftime from tests import check_pytex, check_spider, check_t_kinter if interactive: import mpld3 import collections from mpld3 import plugins, utils from plugins import InteractiveLegendPlugin, HighlightLines # check what environment we're in tk = check_t_kinter() running_python_tex = check_pytex() running_spider = check_spider() def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100): """remove extreme values from colourmap --- no pure white""" import matplotlib.colors as colors import numpy as np new_cmap = colors.LinearSegmentedColormap.from_list( 'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval), cmap(np.linspace(minval, maxval, n))) return new_cmap def get_savename(imagefolder, save = False, title = False, ext = 'png'): """Come up with the savename for the 
image.""" import os def urlify(s): "Turn title into filename" import re s = s.lower() s = re.sub(r"[^\w\s-]", '', s) s = re.sub(r"\s+", '-', s) s = re.sub(r"-(textbf|emph|textsc|textit)", '-', s) return s # name as if not ext.startswith('.'): ext = '.' + ext if type(save) == str: savename = os.path.join(imagefolder, (urlify(save) + ext)) #this 'else' is redundant now that title is obligatory else: if title: filename = urlify(title) + ext savename = os.path.join(imagefolder, filename) # remove duplicated ext if savename.endswith('%s%s' % (ext, ext)): savename = savename.replace('%s%s' % (ext, ext), ext, 1) return savename def rename_data_with_total(dataframe, was_series = False, using_tex = False, absolutes = True): """adds totals (abs, rel, keyness) to entry name strings""" if was_series: where_the_words_are = dataframe.index else: where_the_words_are = dataframe.columns the_labs = [] for w in list(where_the_words_are): if not absolutes: if was_series: perc = dataframe.T[w][0] else: the_labs.append(w) continue if using_tex: the_labs.append('%s (%.2f\%%)' % (w, perc)) else: the_labs.append('%s (%.2f %%)' % (w, perc)) else: if was_series: score = dataframe.T[w].sum() else: score = dataframe[w].sum() if using_tex: the_labs.append('%s (n=%d)' % (w, score)) else: the_labs.append('%s (n=%d)' % (w, score)) if not was_series: dataframe.columns = the_labs else: vals = list(dataframe[list(dataframe.columns)[0]].values) dataframe = pandas.DataFrame(vals, index = the_labs) dataframe.columns = ['Total'] return dataframe def auto_explode(dataframe, input, was_series = False, num_to_plot = 7): """give me a list of strings and i'll output explode option""" output = [0 for s in range(num_to_plot)] if was_series: l = list(dataframe.index) else: l = list(dataframe.columns) if type(input) == str or type(input) == int: input = [input] if type(input) == list: for i in input: if type(i) == str: index = l.index(i) else: index = i output[index] = 0.1 return output # check if we're doing 
subplots sbplt = False if 'subplots' in kwargs: if kwargs['subplots'] is True: sbplt = True kwargs['subplots'] = sbplt if colours is True: colours = 'Paired' # todo: get this dynamically instead. styles = ['dark_background', 'bmh', 'grayscale', 'ggplot', 'fivethirtyeight', 'matplotlib', False, 'mpl-white'] #if style not in styles: #raise ValueError('Style %s not found. Use %s' % (str(style), ', '.join(styles))) if style == 'mpl-white': try: sns.set_style("whitegrid") except: pass style = 'matplotlib' if style is not False and style.startswith('seaborn'): colours = False # use 'draggable = True' to make a draggable legend dragmode = kwargs.get('draggable', False) kwargs.pop('draggable', None) if kwargs.get('savepath'): mpl.rcParams['savefig.directory'] = kwargs.get('savepath') kwargs.pop('savepath', None) mpl.rcParams['savefig.bbox'] = 'tight' mpl.rcParams.update({'figure.autolayout': True}) # try to use tex # TO DO: # make some font kwargs here using_tex = False mpl.rcParams['font.family'] = 'sans-serif' mpl.rcParams['text.latex.unicode'] = True if tex == 'try' or tex is True: try: rc('text', usetex=True) rc('font', **{'family': 'serif', 'serif': ['Computer Modern']}) using_tex = True except: matplotlib.rc('font', family='sans-serif') matplotlib.rc('font', serif='Helvetica Neue') matplotlib.rc('text', usetex='false') rc('text', usetex=False) else: rc('text', usetex=False) if interactive: using_tex = False if show_totals is False: show_totals = 'none' # find out what kind of plot we're making, and enable # or disable interactive values if need be kwargs['kind'] = kind.lower() if interactive: if kwargs['kind'].startswith('bar'): interactive_types = [3] elif kwargs['kind'] == 'area': interactive_types = [2, 3] elif kwargs['kind'] == 'line': interactive_types = [2, 3] elif kwargs['kind'] == 'pie': interactive_types = None warnings.warn('Interactive plotting not yet available for pie plots.') else: interactive_types = [None] if interactive is False: interactive_types = 
[None] # find out if pie mode, add autopct format piemode = False if kind == 'pie': piemode = True # always the best spot for pie #if legend_pos == 'best': #legend_pos = 'lower left' if show_totals.endswith('plot') or show_totals.endswith('both'): kwargs['pctdistance'] = 0.6 if using_tex: kwargs['autopct'] = r'%1.1f\%%' else: kwargs['autopct'] = '%1.1f%%' # copy data, make series into df dataframe = df.copy() was_series = False if type(dataframe) == pandas.core.series.Series: was_series = True if not cumulative: dataframe = DataFrame(dataframe) else: dataframe = DataFrame(dataframe.cumsum()) else: # don't know if this is much good. if cumulative: dataframe = DataFrame(dataframe.cumsum()) if len(list(dataframe.columns)) == 1: was_series = True # attempt to convert x axis to ints: try: dataframe.index = [int(i) for i in list(dataframe.index)] except: pass # remove totals and tkinter order if not was_series and not all(x.lower() == 'total' for x in list(dataframe.columns)): for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]): try: dataframe = dataframe.drop(name, axis = ax, errors = 'ignore') except: pass else: dataframe = dataframe.drop('tkintertable-order', errors = 'ignore') dataframe = dataframe.drop('tkintertable-order', axis = 1, errors = 'ignore') # look at columns to see if all can be ints, in which case, set up figure # for depnumming if not was_series: if indices == 'guess': def isint(x): try: a = float(x) b = int(a) except ValueError or OverflowError: return False else: return a == b if all([isint(x) is True for x in list(dataframe.columns)]): indices = True else: indices = False # if depnumming, plot all, transpose, and rename axes if indices is True: num_to_plot = 'all' dataframe = dataframe.T if y_label is None: y_label = 'Percentage of all matches' if x_label is None: x_label = '' # set backend? 
output_formats = ['svgz', 'ps', 'emf', 'rgba', 'raw', 'pdf', 'svg', 'eps', 'png', 'pgf'] if output_format not in output_formats: raise ValueError('%s output format not recognised. Must be: %s' % (output_format, ', '.join(output_formats))) # don't know if these are necessary if 'pdf' in output_format: plt.switch_backend(output_format) if 'pgf' in output_format: plt.switch_backend(output_format) if num_to_plot == 'all': if was_series: if not piemode: num_to_plot = len(dataframe) else: num_to_plot = len(dataframe) else: if not piemode: num_to_plot = len(list(dataframe.columns)) else: num_to_plot = len(dataframe.index) # explode pie, or remove if not piemode if piemode and not sbplt and kwargs.get('explode'): kwargs['explode'] = auto_explode(dataframe, kwargs['explode'], was_series = was_series, num_to_plot = num_to_plot) else: kwargs.pop('explode', None) legend = kwargs.get('legend', False) #cut data short plotting_a_totals_column = False if was_series: if list(dataframe.columns)[0] != 'Total': try: can_be_ints = [int(x) for x in list(dataframe.index)] num_to_plot = len(dataframe) except: dataframe = dataframe[:num_to_plot] elif list(dataframe.columns)[0] == 'Total': plotting_a_totals_column = True if not 'legend' in kwargs: legend = False num_to_plot = len(dataframe) else: dataframe = dataframe.T.head(num_to_plot).T # remove stats fields, put p in entry text, etc. 
statfields = ['slope', 'intercept', 'r', 'p', 'stderr'] try: dataframe = dataframe.drop(statfields, axis = 1, errors = 'ignore') except: pass try: dataframe.ix['p'] there_are_p_vals = True except: there_are_p_vals = False if show_p_val: if there_are_p_vals: newnames = [] for col in list(dataframe.columns): pval = dataframe[col]['p'] def p_string_formatter(val): if val < 0.001: if not using_tex: return 'p < 0.001' else: return r'p $<$ 0.001' else: return 'p = %s' % format(val, '.3f') pstr = p_string_formatter(pval) newname = '%s (%s)' % (col, pstr) newnames.append(newname) dataframe.columns = newnames dataframe.drop(statfields, axis = 0, inplace = True, errors = 'ignore') else: warnings.warn('No p-values calculated to show.\n\nUse sort_by and keep_stats in editor() to generate these values.') else: if there_are_p_vals: dataframe.drop(statfields, axis = 0, inplace = True, errors = 'ignore') # make and set y label absolutes = True if type(dataframe) == pandas.core.frame.DataFrame: try: if not all([s.is_integer() for s in dataframe.iloc[0,:].values]): absolutes = False except: pass else: if not all([s.is_integer() for s in dataframe.values]): absolutes = False # use colormap if need be: if num_to_plot > 0: if not was_series: if kind in ['pie', 'line', 'area']: if colours: if not plotting_a_totals_column: if colours == 'Default': colours = 'Paired' kwargs['colormap'] = colours #else: if colours: if colours == 'Default': colours = 'Paired' kwargs['colormap'] = colours if piemode: if num_to_plot > 0: if colours == 'Default': colours = 'Paired' kwargs['colormap'] = colours else: if num_to_plot > 0: if colours == 'Default': colours = 'Paired' kwargs['colormap'] = colours # multicoloured bar charts if colours: if kind.startswith('bar'): if len(list(dataframe.columns)) == 1: if not black_and_white: import numpy as np the_range = np.linspace(0, 1, num_to_plot) cmap = plt.get_cmap(colours) kwargs['colors'] = [cmap(n) for n in the_range] # make a bar width ... ? ... 
#kwargs['width'] = (figsize[0] / float(num_to_plot)) / 1.5 # reversing legend option if reverse_legend is True: rev_leg = True elif reverse_legend is False: rev_leg = False # show legend or don't, guess whether to reverse based on kind if kind in ['bar', 'barh', 'area', 'line', 'pie']: if was_series: legend = False if kind == 'pie': if pie_legend: legend = True else: legend = False if kind in ['barh', 'area']: if reverse_legend == 'guess': rev_leg = True if not 'rev_leg' in locals(): rev_leg = False # the default legend placement if legend_pos is True: legend_pos = 'best' # cut dataframe if just_totals try: tst = dataframe['Combined total'] dataframe = dataframe.head(num_to_plot) except: pass # rotate automatically if 'rot' not in kwargs: if not was_series: xvals = [str(i) for i in list(dataframe.index)[:num_to_plot]] #if 'kind' in kwargs: #if kwargs['kind'] in ['barh', 'area']: #xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]] else: xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]] if len(max(xvals, key=len)) > 6: if not piemode: kwargs['rot'] = 45 # no title for subplots because ugly, if title and not sbplt: kwargs['title'] = title # no interactive subplots yet: if sbplt and interactive: import warnings interactive = False warnings.warn('No interactive subplots yet, sorry.') return # not using pandas for labels or legend anymore. 
#kwargs['labels'] = None #kwargs['legend'] = False if legend: if num_to_plot > 6: if not kwargs.get('ncol'): kwargs['ncol'] = num_to_plot / 7 # kwarg options go in leg_options leg_options = {'framealpha': .8, 'shadow': kwargs.get('shadow', False), 'ncol': kwargs.pop('ncol', 1)} # determine legend position based on this dict if legend_pos: possible = {'best': 0, 'upper right': 1, 'upper left': 2, 'lower left': 3, 'lower right': 4, 'right': 5, 'center left': 6, 'center right': 7, 'lower center': 8, 'upper center': 9, 'center': 10, 'o r': 2, 'outside right': 2, 'outside upper right': 2, 'outside center right': 'center left', 'outside lower right': 'lower left'} if type(legend_pos) == int: the_loc = legend_pos elif type(legend_pos) == str: try: the_loc = possible[legend_pos] except KeyError: raise KeyError('legend_pos value must be one of:\n%s\n or an int between 0-10.' %', '.join(list(possible.keys()))) leg_options['loc'] = the_loc #weirdness needed for outside plot if legend_pos in ['o r', 'outside right', 'outside upper right']: leg_options['bbox_to_anchor'] = (1.02, 1) if legend_pos == 'outside center right': leg_options['bbox_to_anchor'] = (1.02, 0.5) if legend_pos == 'outside lower right': leg_options['loc'] == 'upper right' leg_options['bbox_to_anchor'] = (0.5, 0.5) # a bit of distance between legend and plot for outside legends if type(legend_pos) == str: if legend_pos.startswith('o'): leg_options['borderaxespad'] = 1 if not piemode: if show_totals.endswith('both') or show_totals.endswith('legend'): dataframe = rename_data_with_total(dataframe, was_series = was_series, using_tex = using_tex, absolutes = absolutes) else: if pie_legend: if show_totals.endswith('both') or show_totals.endswith('legend'): dataframe = rename_data_with_total(dataframe, was_series = was_series, using_tex = using_tex, absolutes = absolutes) if piemode: if partial_pie: dataframe = dataframe / 100.0 # some pie things if piemode: if not sbplt: kwargs['y'] = list(dataframe.columns)[0] if 
pie_legend: kwargs['legend'] = False if was_series: leg_options['labels'] = list(dataframe.index) else: leg_options['labels'] = list(dataframe.columns) else: if pie_legend: kwargs['legend'] = False if was_series: leg_options['labels'] = list(dataframe.index) else: leg_options['labels'] = list(dataframe.index) def filler(df): pby = df.T.copy() for i in list(pby.columns): tot = pby[i].sum() pby[i] = pby[i] * 100.0 / tot return pby.T areamode = False if kind == 'area': areamode = True if legend is False: kwargs['legend'] = False # line highlighting option for interactive! if interactive: if 2 in interactive_types: if kind == 'line': kwargs['marker'] = ',' if not piemode: kwargs['alpha'] = 0.1 # convert dates --- works only in my current case! if plotting_a_totals_column or not was_series: try: can_it_be_int = int(list(dataframe.index)[0]) can_be_int = True except: can_be_int = False if can_be_int: if 1500 < int(list(dataframe.index)[0]): if 2050 > int(list(dataframe.index)[0]): n = pandas.PeriodIndex([d for d in list(dataframe.index)], freq='A') dataframe = dataframe.set_index(n) if kwargs.get('filled'): if areamode or kind.startswith('bar'): dataframe = filler(dataframe) kwargs.pop('filled', None) MARKERSIZE = 4 COLORMAP = { 0: {'marker': None, 'dash': (None,None)}, 1: {'marker': None, 'dash': [5,5]}, 2: {'marker': "o", 'dash': (None,None)}, 3: {'marker': None, 'dash': [1,3]}, 4: {'marker': "s", 'dash': [5,2,5,2,5,10]}, 5: {'marker': None, 'dash': [5,3,1,2,1,10]}, 6: {'marker': 'o', 'dash': (None,None)}, 7: {'marker': None, 'dash': [5,3,1,3]}, 8: {'marker': "1", 'dash': [1,3]}, 9: {'marker': "*", 'dash': [5,5]}, 10: {'marker': "2", 'dash': [5,2,5,2,5,10]}, 11: {'marker': "s", 'dash': (None,None)} } HATCHES = { 0: {'color': '#dfdfdf', 'hatch':"/"}, 1: {'color': '#6f6f6f', 'hatch':"\\"}, 2: {'color': 'b', 'hatch':"|"}, 3: {'color': '#dfdfdf', 'hatch':"-"}, 4: {'color': '#6f6f6f', 'hatch':"+"}, 5: {'color': 'b', 'hatch':"x"} } if black_and_white: if kind == 'line': 
kwargs['linewidth'] = 1 cmap = plt.get_cmap('Greys') new_cmap = truncate_colormap(cmap, 0.25, 0.95) if kind == 'bar': # darker if just one entry if len(dataframe.columns) == 1: new_cmap = truncate_colormap(cmap, 0.70, 0.90) kwargs['colormap'] = new_cmap class dummy_context_mgr(): """a fake context for plotting without style perhaps made obsolete by 'classic' style in new mpl""" def __enter__(self): return None def __exit__(self, one, two, three): return False with plt.style.context((style)) if style != 'matplotlib' else dummy_context_mgr(): if not sbplt: # check if negative values, no stacked if so if areamode: kwargs['legend'] = False if dataframe.applymap(lambda x: x < 0.0).any().any(): kwargs['stacked'] = False rev_leg = False ax = dataframe.plot(figsize = figsize, **kwargs) if areamode: handles, labels = plt.gca().get_legend_handles_labels() del handles del labels else: plt.gcf().set_tight_layout(False) if not piemode: ax = dataframe.plot(figsize = figsize, **kwargs) else: ax = dataframe.plot(figsize = figsize, **kwargs) handles, labels = plt.gca().get_legend_handles_labels() plt.legend( handles, labels, loc = leg_options['loc'], bbox_to_anchor = (0,-0.1,1,1), bbox_transform = plt.gcf().transFigure ) # this line allows layouts with missing plots # i.e. 
layout = (5, 2) with only nine plots plt.gcf().set_tight_layout(False) if 'rot' in kwargs: if kwargs['rot'] != 0 and kwargs['rot'] != 90: labels = [item.get_text() for item in ax.get_xticklabels()] ax.set_xticklabels(labels, rotation = kwargs['rot'], ha='right') if transparent: plt.gcf().patch.set_facecolor('white') plt.gcf().patch.set_alpha(0) if black_and_white: if kind == 'line': # white background # change everything to black and white with interesting dashes and markers c = 0 for line in ax.get_lines(): line.set_color('black') #line.set_width(1) line.set_dashes(COLORMAP[c]['dash']) line.set_marker(COLORMAP[c]['marker']) line.set_markersize(MARKERSIZE) c += 1 if c == len(list(COLORMAP.keys())): c = 0 # draw legend with proper placement etc if legend: if not piemode and not sbplt: if 3 not in interactive_types: handles, labels = plt.gca().get_legend_handles_labels() # area doubles the handles and labels. this removes half: if areamode: handles = handles[-len(handles) / 2:] labels = labels[-len(labels) / 2:] if rev_leg: handles = handles[::-1] labels = labels[::-1] lgd = plt.legend(handles, labels, **leg_options) if interactive: # 1 = highlight lines # 2 = line labels # 3 = legend switches ax = plt.gca() # fails for piemode lines = ax.lines handles, labels = plt.gca().get_legend_handles_labels() if 1 in interactive_types: plugins.connect(plt.gcf(), HighlightLines(lines)) if 3 in interactive_types: plugins.connect(plt.gcf(), InteractiveLegendPlugin(lines, labels, alpha_unsel=0.0)) for i, l in enumerate(lines): y_vals = l.get_ydata() x_vals = l.get_xdata() x_vals = [str(x) for x in x_vals] if absolutes: ls = ['%s (%s: %d)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)] else: ls = ['%s (%s: %.2f%%)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)] if 2 in interactive_types: #if 'kind' in kwargs and kwargs['kind'] == 'area': tooltip_line = mpld3.plugins.LineLabelTooltip(lines[i], labels[i]) mpld3.plugins.connect(plt.gcf(), 
tooltip_line) #else: if kind == 'line': tooltip_point = mpld3.plugins.PointLabelTooltip(l, labels = ls) mpld3.plugins.connect(plt.gcf(), tooltip_point) if piemode: if not sbplt: plt.axis('equal') ax.get_xaxis().set_visible(False) ax.get_yaxis().set_visible(False) # add x label # this could be revised now! # if time series period, it's year for now if type(dataframe.index) == pandas.tseries.period.PeriodIndex: x_label = 'Year' if x_label is not False: if type(x_label) == str: plt.xlabel(x_label) else: check_x_axis = list(dataframe.index)[0] # get first entry# get second entry of first entry (year, count) try: if type(dataframe.index) == pandas.tseries.period.PeriodIndex: x_label = 'Year' check_x_axis = int(check_x_axis) if 1500 < check_x_axis < 2050: x_label = 'Year' else: x_label = 'Group' except: x_label = 'Group' if not sbplt: if not piemode: plt.xlabel(x_label) def is_number(s): """check if str can be can be made into float/int""" try: float(s) # for int, long and float except ValueError: try: complex(s) # for complex except ValueError: return False return True # for now, always turn off sci notation from matplotlib.ticker import ScalarFormatter if type(dataframe.index) != pandas.tseries.period.PeriodIndex: try: if all(is_number(s) for s in list(dataframe.index)): plt.gca().xaxis.set_major_formatter(ScalarFormatter()) except: pass try: if all(is_number(s) for s in list(dataframe.columns)): plt.gca().yaxis.set_major_formatter(ScalarFormatter()) except: pass # y labelling y_l = False if not absolutes: y_l = 'Percentage' else: y_l = 'Absolute frequency' def suplabel(axis,label,label_prop=None, labelpad=5, ha='center',va='center'): ''' Add super ylabel or xlabel to the figure Similar to matplotlib.suptitle axis - string: "x" or "y" label - string label_prop - keyword dictionary for Text labelpad - padding from the axis (default: 5) ha - horizontal alignment (default: "center") va - vertical alignment (default: "center") ''' fig = plt.gcf() xmin = [] ymin = [] for ax 
in fig.axes: xmin.append(ax.get_position().xmin) ymin.append(ax.get_position().ymin) xmin,ymin = min(xmin),min(ymin) dpi = fig.dpi if axis.lower() == "y": rotation=90. x = xmin-float(labelpad)/dpi y = 0.5 elif axis.lower() == 'x': rotation = 0. x = 0.5 y = ymin - float(labelpad)/dpi else: raise Exception("Unexpected axis: x or y") if label_prop is None: label_prop = dict() plt.gcf().text(x,y,label,rotation=rotation, transform=fig.transFigure, ha=ha,va=va, **label_prop) if y_label is not False: if not sbplt: if not piemode: if type(y_label) == str: plt.ylabel(y_label) else: plt.ylabel(y_l) else: if type(y_label) == str: the_y = y_label else: the_y = y_l #suplabel('y', the_y, labelpad = 1.5) plt.gcf().text(0.04, 0.5, the_y, va='center', rotation='vertical') #plt.subplots_adjust(left=0.5) # if not piemode: # if type(y_label) == str: # plt.ylabel(y_label) # else: # plt.ylabel(y_l) # hacky: turn legend into subplot titles :) if sbplt: # title the big plot #plt.gca().suptitle(title, fontsize = 16) #plt.subplots_adjust(top=0.9) # get all axes if 'layout' not in kwargs: axes = [l for index, l in enumerate(ax)] else: axes = [] cols = [l for index, l in enumerate(ax)] for col in cols: for bit in col: axes.append(bit) # set subplot titles for index, a in enumerate(axes): try: titletext = list(dataframe.columns)[index] except: pass a.set_title(titletext) try: a.legend_.remove() except: pass # remove axis labels for pie plots if piemode: a.axes.get_xaxis().set_visible(False) a.axes.get_yaxis().set_visible(False) a.axis('equal') # show grid a.grid(b=kwargs.get('grid', False)) kwargs.pop('grid', None) # add sums to bar graphs and pie graphs # doubled right now, no matter if not sbplt: if kind.startswith('bar'): width = ax.containers[0][0].get_width() # show grid ax.grid(b=kwargs.get('grid', False)) kwargs.pop('grid', None) if was_series: the_y_limit = plt.ylim()[1] if show_totals.endswith('plot') or show_totals.endswith('both'): # make plot a bit higher if putting these totals on 
it plt.ylim([0,the_y_limit * 1.05]) for i, label in enumerate(list(dataframe.index)): if len(dataframe.ix[label]) == 1: score = dataframe.ix[label][0] else: if absolutes: score = dataframe.ix[label].sum() else: #import warnings #warnings.warn("It's not possible to determine total percentage from individual percentages.") continue if not absolutes: plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom') else: plt.annotate(score, (i, score), ha = 'center', va = 'bottom') else: the_y_limit = plt.ylim()[1] if show_totals.endswith('plot') or show_totals.endswith('both'): for i, label in enumerate(list(dataframe.columns)): if len(dataframe[label]) == 1: score = dataframe[label][0] else: if absolutes: score = dataframe[label].sum() else: #import warnings #warnings.warn("It's not possible to determine total percentage from individual percentages.") continue if not absolutes: plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom') else: plt.annotate(score, (i, score), ha = 'center', va = 'bottom') plt.subplots_adjust(left=0.1) plt.subplots_adjust(bottom=0.18) if 'layout' not in kwargs: if not sbplt: plt.tight_layout() if save: import os if running_python_tex: imagefolder = '../images' else: imagefolder = 'images' savename = get_savename(imagefolder, save = save, title = title, ext = output_format) if not os.path.isdir(imagefolder): os.makedirs(imagefolder) # save image and get on with our lives if legend_pos.startswith('o'): plt.gcf().savefig(savename, dpi=150, bbox_extra_artists=(lgd,), bbox_inches='tight', format = output_format) else: plt.gcf().savefig(savename, dpi=150, format = output_format) time = strftime("%H:%M:%S", localtime()) if os.path.isfile(savename): print('\n' + time + ": " + savename + " created.") else: raise ValueError("Error making %s." 
% savename) if dragmode: plt.legend().draggable() if sbplt: plt.subplots_adjust(right=.8) plt.subplots_adjust(left=.1) if not interactive and not running_python_tex and not running_spider \ and not tk: plt.gcf().show() return elif running_spider or tk: return plt if interactive: plt.subplots_adjust(right=.8) plt.subplots_adjust(left=.1) try: ax.legend_.remove() except: pass return mpld3.display()
def main():
    """Print a guided tour of basic pandas Series/DataFrame operations.

    The demo covers, in order: reindexing (with fill values and
    forward/backward filling), dropping entries, label / positional /
    boolean selection, arithmetic with index alignment, ``apply`` and
    element-wise mapping, sorting, ranking, and indexes that contain
    duplicate labels.  Everything is written to standard output and
    nothing is returned.

    Expects ``np``, ``pd``, ``Series`` and ``DataFrame`` to be available in
    the enclosing module.

    Compatibility notes:
      * Every ``print`` call takes exactly one argument, so the output is
        the same with the Python 2 ``print`` statement and the Python 3
        ``print`` function.
      * Accessors that pandas has removed (``.ix``, ``icol``/``irow``,
        ``Series.order``, ``sort_index(by=...)``) are replaced by
        ``.loc`` / ``.iloc`` / ``reindex`` / ``sort_values``, which select
        the same data.
    """
    # ---- Reindexing ------------------------------------------------------
    obj = Series(range(4), index="a b c d".split(" ")[::-1])
    print(obj)
    obj2 = obj.reindex("a b c d e".split(" "))
    print(obj2)
    # Labels absent from the original become NaN unless fill_value is given.
    print(obj.reindex("a b c d e".split(" "), fill_value=0))

    colors = ["blue", "purple", "yellow"]
    index = [0, 2, 4]
    obj3 = Series(colors, index=index)
    print(obj3.reindex(range(6)))
    # Missing labels take the previous known value (forward fill) ...
    print(obj3.reindex(range(6), method="ffill"))
    # ... or the next known value (backward fill, alias "bfill").
    print(obj3.reindex(range(6), method="backfill"))

    # Reindexing a DataFrame, by rows and by columns.
    states = ["Ohio", "Texas", "California"]
    frame = DataFrame(
        np.arange(9).reshape((3, 3)),
        index="a b c".split(" "),
        columns=["Ohio", "Texas", "California"],
    )
    print(frame)
    frame2 = frame.reindex("a b c d".split(" "))
    print(frame2)
    # Leaves states == ["Texas", "Utah", "California"]; "Utah" is not a
    # column of `frame`, so it shows up as an all-NaN column below.
    states[0] = "Utah"
    states[1], states[0] = states[:2]
    print(frame.reindex(columns=states))
    # Forward-fill the new row, then reorder the columns in a second call:
    # current pandas applies `method` to every axis passed in one call and
    # rejects the non-monotonic column labels.
    rows = "a b c d".split(" ")
    print(frame.reindex(rows, method="ffill").reindex(columns=states))
    # Formerly frame.ix[...]; .ix silently reindexed when a label was
    # missing, so reindex() is the faithful replacement.
    print(frame.reindex(rows))
    print(frame.reindex(index=rows, columns=states))

    # ---- Dropping entries ------------------------------------------------
    # Section separator: a line holding one space, which is what the
    # original two-empty-string print produced.
    print(" ")
    obj = Series(range(5), index="a b c d e".split(" "))
    new_obj = obj.drop("c")
    print(new_obj)
    print(obj)  # drop() returned a new object; the original is unchanged

    # ---- Selecting from a Series ------------------------------------------
    print(" ")
    obj = Series(np.arange(4.0), index="a b c d".split(" "))
    print(obj["b"])
    print(obj.iloc[1])  # the same element, selected by position
    print(obj.iloc[2:4])
    print(obj[["b", "a", "c"]])
    print(obj.iloc[[1, 3]])
    print(obj[obj < 2])
    # Label slices include the end point ('c' here).
    print(obj["b":"c"])
    obj["b":"c"] = 5
    print(obj)

    # ---- Selecting from a DataFrame ---------------------------------------
    data = DataFrame(
        np.arange(16).reshape((4, 4)),
        index=["Ohio", "Colorado", "Utah", "New York"],
        columns=["one", "two", "three", "four"],
    )
    print(data)
    # Columns.
    print(data["two"])
    print(data[["three", "one"]])
    # Rows (slice or boolean mask).
    print(data[:2])
    print(data[data["three"] > 5])
    # Element-wise comparison, then masked assignment.
    print(data < 5)
    data[data < 5] = 0
    print(data)
    # Rows and columns together.
    print(data.loc[["Colorado"], ["two", "three"]])
    # Rows by label, columns by position.
    print(data.loc[["Colorado", "Utah"]].iloc[:, [3, 0, 1]])
    # A single row by position.
    print(data.iloc[2])
    # Label slice of rows plus one column label: returns that column.
    print(data.loc[:"Utah", "two"])
    # xs: cross-section by label, rows by default ...
    print(data.xs("Utah"))
    print(data.xs("Utah", axis=0))
    # ... or columns with axis=1.
    print(data.xs("two", axis=1))
    # Positional column / row access (formerly icol / irow).
    print(data.iloc[:, 1])
    print(data.iloc[1])

    # ---- Arithmetic with index alignment ----------------------------------
    print(" ")
    s1 = Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
    s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
    print(s1)
    print(s2)
    # The result index is the union; d, f and g exist on one side only, so
    # they come out as NaN.
    print(s1 + s2)
    df1 = DataFrame(
        np.arange(9.0).reshape((3, 3)),
        columns=list("bcd"),
        index=["Ohio", "Texas", "Colorado"],
    )
    df2 = DataFrame(
        np.arange(12.0).reshape((4, 3)),
        columns=list("bde"),
        index=["Utah", "Ohio", "Texas", "Oregon"],
    )
    print(df1)
    print(df2)
    print(df1 + df2)

    # Arithmetic methods accept a fill value for labels missing on one side.
    print(" ")
    df1 = DataFrame(np.arange(12.0).reshape((3, 4)), columns=list("abcd"))
    df2 = DataFrame(np.arange(20.0).reshape((4, 5)), columns=list("abcde"))
    print(df1)
    print(df2)
    print(df1.add(df2, fill_value=0))
    # reindex() takes a fill_value argument as well; the other arithmetic
    # methods are sub / div / mul.

    # ---- Operations between a DataFrame and a Series ------------------------
    print(" ")
    # NumPy broadcasting: the first row is subtracted from every row.
    arr = np.arange(12.0).reshape((3, 4))
    print(arr)
    print(arr[0])
    print(arr - arr[0])
    frame = DataFrame(
        np.arange(12.0).reshape((4, 3)),
        columns=list("bde"),
        index=["Utah", "Ohio", "Texas", "Oregon"],
    )
    series = frame.iloc[0]
    print(frame)
    print(series)
    print(frame - series)
    series2 = Series(range(3), index=list("bef"))
    print(frame + series2)
    series3 = frame["d"]
    series4 = frame.iloc[0]
    print(frame)
    print(series3)
    print(series4)
    print(frame.sub(series3, axis=0))
    print(frame.sub(series4, axis=1))

    # ---- Function application and mapping -----------------------------------
    print(" ")
    frame = DataFrame(
        np.arange(12.0).reshape((4, 3)),
        columns=list("bde"),
        index=["Utah", "Ohio", "Texas", "Oregon"],
    )
    print(frame)

    def value_range(x):
        return x.max() - x.min()

    print(frame.apply(value_range))
    print(frame.apply(value_range, axis=1))

    def min_max(x):
        return Series([x.min(), x.max()], index=["min", "max"])

    print(frame.apply(min_max))

    # Named two_decimals rather than `format` to avoid shadowing the builtin.
    def two_decimals(x):
        return "{0:.2f}".format(x)

    # DataFrame.applymap was renamed DataFrame.map in pandas 2.1; use
    # whichever one this pandas provides.
    elementwise = getattr(frame, "map", None) or frame.applymap
    print(elementwise(two_decimals))  # every cell of the frame
    print(frame["e"].map(two_decimals))  # every element of one Series

    # ---- Sorting --------------------------------------------------------------
    print(" ")
    obj = Series(range(4), index=list("dabc"))
    print(obj)
    print(obj.sort_index())
    frame = DataFrame(
        np.arange(8).reshape((2, 4)),
        index=["three", "one"],
        columns=list("dabc"),
    )
    print(frame)
    print(frame.sort_index())
    print(frame.sort_index(axis=1))
    print(frame.sort_index(axis=1, ascending=False))

    # Sorting a Series by value (formerly Series.order()).
    print(" ")
    obj = Series([4, 7, -3, 2])
    print(obj.sort_values())
    obj = Series([4, np.nan, 7, np.nan, -3, 2])
    print(obj.sort_values())
    print(obj.sort_values(ascending=False))

    # Sorting a DataFrame by several columns (formerly sort_index(by=...)).
    print(" ")
    frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
    print(frame.sort_values(by=["a", "b"]))

    # ---- Ranking --------------------------------------------------------------
    print(" ")
    obj = Series([7, -5, 7, 4, 2, 0, 4])
    print(obj.rank())  # default method: ties share the average rank
    print(obj.rank(method="first"))  # ties ranked in order of appearance
    print(obj.rank(ascending=False, method="min"))
    print(obj.rank(ascending=False, method="max"))
    f1 = DataFrame(obj, columns=["data"])
    f2 = DataFrame(obj.rank(), columns=["rank"])
    # Join the values and their ranks on the shared index.
    print(pd.merge(f1, f2, left_index=True, right_index=True))

    # ---- Axis indexes with duplicate labels -------------------------------------
    print(" ")
    obj = Series(range(5), index=list("aaabc"))
    print(obj)
    print(obj.index.is_unique)
    print(obj["a"])  # duplicated label
    print(obj["c"])  # unique label
    df = DataFrame(
        np.arange(12.0).reshape((4, 3)),
        index=list("aabb"),
        columns=list("ccd"),
    )
    print(df)
    print(df.loc["b"])
    print(df["c"])
# Show the float column with its missing values replaced by the column mean.
df3 = df.copy()
mean = df3["float_col"].mean()
print(df3)
print(df3["float_col"].fillna(mean))

### 4. Map, Apply ###
print("---")
###
# Series.map: prefix every non-missing string in str_col.
print(df["str_col"].dropna().map(lambda x: "map_" + x))
# DataFrame.apply: call the function once per selected column.  The columns
# are picked by label with .loc, which replaces the removed .ix indexer and
# selects the same data for label lists.
print(df.loc[:, ["int_col", "float_col"]].apply(np.sqrt))
print(df.loc[:, ["int_col", "float_col"]].apply(np.sum))
# DataFrame.applymap: call some_fn on every individual cell.
print(df.applymap(some_fn))

### Vectorized mathematical and string operations ###
df = pd.DataFrame(data={"A": [1, 2], "B": [1.2, 1.3]})
df["C"] = df["A"] + df["B"]
print(df)
df["D"] = df["A"] * 3
print(df)
df["E"] = np.sqrt(df["A"])
# 7.8 NA 치환 df1 = DataFrame({'col1':[1, NA, 2, NA, 3], 'col2':['a','b','c','d',NA]}) # 1) np.where np.where(pd.isnull(df1.col2), 'e', df1.col2) # 1차원(컬럼) 가능 np.where(pd.isnull(df1), 'e', df1) # 2차원 가능 # 2) 조건 치환 df1.col2[pd.isnull(df1.col2)] = 'e' # 1차원 직접 수정 방식 df1[pd.isnull(df1)] = 'e' # 2차원 직접 수정 방식 불가 df1.loc[pd.isnull(df1)] = 'e' # 2차원 직접 수정 방식 불가 # 3) 적용함수의 사용 df1.col2.map(lambda x : 'e' if pd.isnull(x) else x) # 1차원 map으로 가능 df1.applymap(lambda x : 'e' if pd.isnull(x) else x) # 2차원 applymap 가능 # 4) NA 치환 함수 df1.col2.fillna('e') # 1차원(Series) 데이터셋 NA 치환 가능 df1.fillna('e') # 2차원(DataFrame) 데이터셋 NA 치환 가능 df1.fillna({'col1':0, 'col2':'e'}) # 딕셔너리 전달로 컬럼별 서로 다른 값 치환 df1.fillna(method='ffill') # 이전 값으로의 치환 df1.fillna(method='bfill') # 다음 값으로의 치환 # 5) pandas replace 메서드 활용(밑에 정리) df1.replace(NA,0) # 7.9 replace 메서드 # 1) 문자열 메서드 형태(기본 파이썬 제공)