def editor(dataframe1,
           operation = None,
           dataframe2 = False,
           sort_by = False,
           keep_stats = False,
           keep_top = False,
           just_totals = False,
           threshold = 'medium',
           just_entries = False,
           skip_entries = False,
           merge_entries = False,
           newname = 'combine',
           multiple_merge = False,
           just_subcorpora = False,
           skip_subcorpora = False,
           span_subcorpora = False,
           merge_subcorpora = False,
           new_subcorpus_name = False,
           replace_names = False,
           projection = False,
           remove_above_p = False,
           p = 0.05,
           revert_year = True,
           print_info = True,
           spelling = False,
           selfdrop = True,
           calc_all = True,
           **kwargs
           ):
    """Edit results of interrogations, do keywording, sort, etc.

    ``just/skip_entries`` can take a few different kinds of input:

    * str: treated as regular expression to match
    * list:

      * of integers: indices to match
      * of strings: entries to match

    ``just/skip_subcorpora`` take a list of names (ints are converted to
    strings). NOTE(review): a plain string given for subcorpora is tested
    with ``in`` (substring membership), it is not compiled as a regex.

    ``merge_entries`` and ``merge_subcorpora``, however, are best entered
    as dicts: ``{newname: criteria, newname2: criteria2}``, where criteria
    is a string, list, etc.

    :param dataframe1: Results to edit. A dict of interrogations is also
                       accepted; each value's ``.results`` is then edited
                       separately and a dict of outputs is returned
    :type dataframe1: pandas.core.frame.DataFrame/pandas.core.series.Series/dict

    :param operation: Kind of maths to do on inputted lists:

                      '+', '-', '/', '*', '%': self explanatory
                      'k': log likelihood (keywords)
                      'a': get distance metric (for use with interrogator
                      'a' option)
                      'd': get percent difference (alternative approach to
                      keywording)
                      'c...': concatenate dataframe1 and dataframe2
    :type operation: str

    :param dataframe2: List of results or totals. If list of results, for
                       each entry in dataframe1, locate entry with same
                       name in dataframe2, and do maths there. If 'self',
                       do all merging/keeping operations, then use edited
                       dataframe1 as dataframe2
    :type dataframe2: pandas.core.series.Series/pandas.core.frame.DataFrame/dict/'self'

    :param sort_by: Calculate slope, stderr, r, p values, then sort by:

                    increase: highest to lowest slope value
                    decrease: lowest to highest slope value
                    turbulent: most change in y axis values
                    static: least change in y axis values
                    total/most: largest number first
                    infreq/least: smallest number first
                    name: alphabetically
    :type sort_by: str

    :param keep_stats: Keep/drop stats values from dataframe after sorting
    :type keep_stats: bool

    :param keep_top: After sorting, remove all but the top *keep_top* results
    :type keep_top: int

    :param just_totals: Sum each column and work with sums
    :type just_totals: bool

    :param threshold: When using results list as dataframe2, drop values
                      occurring fewer than n times. As a string:

                      ``'high'``: dataframe2 total / 2500
                      ``'medium'``: dataframe2 total / 5000
                      ``'low'``: dataframe2 total / 10000

                      Note: if keywording, there are smaller default
                      thresholds
    :type threshold: int/bool/str

    :param just_entries: Keep matching entries
    :type just_entries: see above

    :param skip_entries: Skip matching entries
    :type skip_entries: see above

    :param merge_entries: Merge matching entries
    :type merge_entries: see above

    :param newname: New name for merged entries
    :type newname: str/'combine'

    :param multiple_merge: If True, merged entries are not dropped after
                           merging, so they can be merged again
    :type multiple_merge: bool

    :param just_subcorpora: Keep matching subcorpora
    :type just_subcorpora: see above

    :param skip_subcorpora: Skip matching subcorpora
    :type skip_subcorpora: see above

    :param span_subcorpora: If subcorpora are numerically named, span all
                            from *int* to *int2*, inclusive
    :type span_subcorpora: tuple -- ``(int, int2)``

    :param merge_subcorpora: Merge matching subcorpora
    :type merge_subcorpora: see above

    :param new_subcorpus_name: Name for merged subcorpora
    :type new_subcorpus_name: str/``'combine'``

    :param replace_names: Edit result names and then merge duplicate names.
                          NOTE(review): for dict input the code uses each
                          *value* as the regex to find and each *key* as
                          the replacement text
    :type replace_names: dict -- ``{replacement_text: criteria}``;
                         list of ``(criteria, replacement_text)`` tuples;
                         str -- a regex to delete from names

    :param projection: Multiply results in a subcorpus by n
    :type projection: tuple -- ``(subcorpus_name, n)``; list of such
                      tuples; dict -- ``{subcorpus_name: n}``

    :param remove_above_p: Delete any result over p
    :type remove_above_p: bool

    :param p: set the p value
    :type p: float

    :param revert_year: when doing linear regression on years, turn annual
                        subcorpora into 1, 2 ...
                        NOTE(review): not read anywhere in this function
    :type revert_year: bool

    :param print_info: Print stuff to console showing what's being edited
    :type print_info: bool

    :param spelling: Convert/normalise spelling
    :type spelling: str -- ``'US'``/``'UK'``

    :param selfdrop: When keywording, try to remove target corpus from
                     reference corpus
    :type selfdrop: bool

    :param calc_all: When keywording, calculate words that appear in
                     either corpus
    :type calc_all: bool

    :returns: ``edited_interrogation`` namedtuple with ``query``,
              ``results`` and ``totals`` fields. Other possible returns:
              a dict of such outputs (dict input), an edited DataFrame
              (concordance input with columns ``l``, ``m``, ``r``), the
              string ``'linregress'`` when scipy is needed for sorting
              but cannot be imported, or None if the user exits the
              interactive prompt.
    :raises ValueError: if dataframe1 or dataframe2 is not a recognised
                        type, or sort_by/threshold is not a known option
    """

    # grab arguments, in case we get dict input and have to iterate.
    # this must stay the first statement so only the arguments are captured
    saved_args = locals()

    import corpkit
    import pandas
    import signal
    import re
    import collections
    import pandas as pd
    import numpy as np
    from pandas import DataFrame, Series
    from time import localtime, strftime

    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        have_ipython = False
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass

    # if passing a multiquery, do each result separately and return
    if type(dataframe1) == dict:
        outdict = {}
        from corpkit.editor import editor
        del saved_args['dataframe1']
        # locals() stores the caller's extra keyword arguments under the
        # literal key 'kwargs'; flatten them so the recursive call receives
        # them the same way a direct call would
        saved_args.update(saved_args.pop('kwargs'))
        for i, (k, v) in enumerate(dataframe1.items()):
            # only print the first time around
            if i != 0:
                saved_args['print_info'] = False
            # if df2 is also a dict, get the relevant entry
            if type(dataframe2) == dict:
                if sorted(set([key.lower() for key in dataframe1.keys()])) == \
                   sorted(set([key.lower() for key in dataframe2.keys()])):
                    saved_args['dataframe2'] = dataframe2[k]
                    if 'use_df2_totals' in kwargs.keys():
                        if kwargs['use_df2_totals'] is True:
                            saved_args['dataframe2'] = dataframe2[k].totals
            outdict[k] = editor(v.results, **saved_args)
        if print_info:
            thetime = strftime("%H:%M:%S", localtime())
            print("\n%s: Finished! Output is a dictionary with keys:\n\n '%s'\n" % (thetime, "'\n '".join(sorted(outdict.keys()))))
        return outdict

    the_time_started = strftime("%Y-%m-%d %H:%M:%S")

    pd.options.mode.chained_assignment = None
    pd.set_option('display.float_format', lambda x: '%.2f' % x)

    from corpkit.tests import check_pytex
    if check_pytex():
        print_info = False

    def combiney(df, df2, operation = '%', threshold = 'medium', prinf = True):
        """mash df and df2 together in appropriate way

        Returns ``(df, totals)``; ``totals`` is False for operations that
        do not generate a totals branch."""

        def report_dropped(to_drop):
            """print a short summary of entries removed by thresholding"""
            to_show = list(to_drop[:5])
            if len(to_drop) > 10:
                to_show.append('...')
                to_show.extend(to_drop[-5:])
            if len(to_drop) > 0:
                print('Removing %d entries below threshold:\n %s' % (len(to_drop), '\n '.join(to_show)))
            if len(to_drop) > 10:
                # + 1 compensates for the '...' placeholder in to_show
                print('... and %d more ... \n' % (len(to_drop) - len(to_show) + 1))
            else:
                print('')

        def report_shape_error():
            """tell the user df and df2 could not be aligned"""
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)

        totals = False
        # delete under threshold
        if just_totals:
            if using_totals:
                if not single_totals:
                    to_drop = list(df2[df2['Combined total'] < threshold].index)
                    df = df.drop([e for e in to_drop if e in list(df.index)])
                    if prinf:
                        report_dropped(to_drop)
                else:
                    denom = df2
        else:
            denom = list(df2)

        if single_totals:
            if operation == '%':
                totals = df.sum() * 100.0 / float(df.sum().sum())
                df = df * 100.0
                try:
                    df = df.div(denom, axis = 0)
                except ValueError:
                    report_shape_error()
            elif operation == '+':
                try:
                    df = df.add(denom, axis = 0)
                except ValueError:
                    report_shape_error()
            elif operation == '-':
                try:
                    df = df.sub(denom, axis = 0)
                except ValueError:
                    report_shape_error()
            elif operation == '*':
                totals = df.sum() * float(df.sum().sum())
                try:
                    df = df.mul(denom, axis = 0)
                except ValueError:
                    report_shape_error()
            elif operation == '/':
                try:
                    totals = df.sum() / float(df.sum().sum())
                    df = df.div(denom, axis = 0)
                except ValueError:
                    report_shape_error()
            elif operation == 'd':
                to_drop = [n for n in list(df.columns) if df[n].sum() < threshold]
                df = df.drop([e for e in to_drop if e in list(df.columns)], axis = 1)
                if prinf:
                    report_dropped(to_drop)
                # get normalised num in target corpus
                norm_in_target = df.div(denom, axis = 0)
                # get normalised num in reference corpus, with or without selfdrop
                tot_in_ref = df.copy()
                for c in list(tot_in_ref.index):
                    if selfdrop:
                        tot_in_ref.ix[c] = df.sum() - tot_in_ref.ix[c]
                    else:
                        tot_in_ref.ix[c] = df.sum()
                norm_in_ref = tot_in_ref.div(df.sum().sum())
                df = (norm_in_target - norm_in_ref) / norm_in_ref * 100.0
                df = df.replace(float(-100.00), np.nan)
            elif operation == 'a':
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis = 1) / df2
            elif operation.startswith('c'):
                import warnings
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    df = pandas.concat([df, df2], axis = 1)
            return df, totals

        elif not single_totals:
            if not operation.startswith('a'):
                # generate totals
                if operation == '%':
                    totals = df.sum() * 100.0 / float(df2.sum().sum())
                if operation == '*':
                    totals = df.sum() * float(df2.sum().sum())
                if operation == '/':
                    totals = df.sum() / float(df2.sum().sum())
                if operation.startswith('c'):
                    # add here the info that merging will not work
                    # with identical colnames
                    import warnings
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        d = pd.concat([df.T, df2.T]).sort()
                        # make index nums
                        d = d.reset_index()
                        # sum and remove duplicates
                        d = d.groupby('index').sum()
                        dx = d.reset_index('index')
                        dx.index = list(dx['index'])
                        df = dx.drop('index', axis = 1).T
                # entry-by-entry maths; entries that cannot be computed
                # (e.g. missing from df2) are left untouched
                for entry in list(df.columns):
                    try:
                        if operation == '%':
                            df[entry] = df[entry] * 100.0 / df2[entry]
                        elif operation == '+':
                            df[entry] = df[entry] + df2[entry]
                        elif operation == '-':
                            df[entry] = df[entry] - df2[entry]
                        elif operation == '*':
                            df[entry] = df[entry] * df2[entry]
                        elif operation == '/':
                            df[entry] = df[entry] / df2[entry]
                    except Exception:
                        continue
            else:
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis = 1) / df2.T.sum()
            return df, totals

    def parse_input(df, the_input):
        """turn whatever has been passed in into list of words that can be
        used as pandas indices---maybe a bad way to go about it

        Returns False if the input type is not recognised."""
        # FIX MERGE ERROR HERE
        parsed_input = False
        import re
        if the_input == 'all':
            the_input = r'.*'
        if type(the_input) == int:
            try:
                the_input = str(the_input)
            except Exception:
                pass
            the_input = [the_input]
        elif type(the_input) == str or type(the_input) == unicode:
            try:
                regex = re.compile(the_input)
                parsed_input = [w for w in list(df) if re.search(regex, w)]
                return parsed_input
            except Exception:
                # not usable as a regex: fall back to a literal name
                the_input = [the_input]
        if type(the_input) == list:
            if type(the_input[0]) == int:
                parsed_input = [word for index, word in enumerate(list(df)) if index in the_input]
            elif type(the_input[0]) == str or type(the_input[0]) == unicode:
                try:
                    parsed_input = [word for word in the_input if word in df.columns]
                except AttributeError: # if series
                    parsed_input = [word for word in the_input if word in df.index]
        return parsed_input

    def synonymise(df, pos = 'n'):
        """pass a df and a pos and convert df columns to most common synonyms

        NOTE(review): not called anywhere in this function, and the inner
        loop refers to an undefined name (``synonyms``), so the broad
        except always fires and every column name is kept as-is. Left
        unchanged until the intended WordNet lookup is confirmed."""
        from nltk.corpus import wordnet as wn
        from collections import Counter
        fixed = []
        for w in list(df.columns):
            try:
                syns = []
                for syns in wn.synsets(w, pos = pos):
                    for w in syns:
                        synonyms.append(w)
                top_syn = Counter(syns).most_common(1)[0][0]
                fixed.append(top_syn)
            except Exception:
                fixed.append(w)
        df.columns = fixed
        return df

    def convert_spell(df, convert_to = 'US', print_info = print_info):
        """turn dataframes into us/uk spelling"""
        from dictionaries.word_transforms import usa_convert
        if print_info:
            print('Converting spelling ... \n')
        if convert_to == 'UK':
            # invert the mapping to go the other way
            usa_convert = {v: k for k, v in usa_convert.items()}
        fixed = []
        for val in list(df.columns):
            try:
                fixed.append(usa_convert[val])
            except Exception:
                fixed.append(val)
        df.columns = fixed
        return df

    def merge_duplicates(df, print_info = print_info):
        """sum together any columns that share a name"""
        if print_info:
            print('Merging duplicate entries ... \n')
        # now we have to merge all duplicates
        for dup in df.columns.get_duplicates():
            temp = df[dup].sum(axis = 1)
            df = df.drop(dup, axis = 1)
            df[dup] = temp
        return df

    def name_replacer(df, replace_names, print_info = print_info):
        """replace entry names and merge"""
        import re
        # double or single nest if need be
        if type(replace_names) == str:
            replace_names = [(replace_names, '')]
        if type(replace_names) != dict:
            if type(replace_names[0]) == str:
                replace_names = [replace_names]
        if type(replace_names) == dict:
            # dict values are the patterns, dict keys the replacements
            replace_names = [(v, k) for k, v in replace_names.items()]
        for to_find, replacement in replace_names:
            if print_info:
                try:
                    print('Replacing "%s" with "%s" ...\n' % (to_find, replacement))
                except Exception:
                    print('Deleting "%s" from entry names ...\n' % (to_find))
            to_find = re.compile(to_find)
            df.columns = [re.sub(to_find, replacement, l) for l in list(df.columns)]
        df = merge_duplicates(df, print_info = False)
        return df

    def just_these_entries(df, parsed_input, prinf = True):
        """drop every entry (column) not in parsed_input"""
        entries = [word for word in list(df) if word not in parsed_input]
        if prinf:
            print('Keeping %d entries:\n %s' % (len(parsed_input), '\n '.join(parsed_input[:10])))
            if len(parsed_input) > 10:
                print('... and %d more ... \n' % (len(parsed_input) - 10))
            else:
                print('')
        df = df.drop(entries, axis = 1)
        return df

    def skip_these_entries(df, parsed_input, prinf = True):
        """drop every entry (column) in parsed_input"""
        if prinf:
            print('Skipping %d entries:\n %s' % (len(parsed_input), '\n '.join(parsed_input[:10])))
            if len(parsed_input) > 10:
                print('... and %d more ... \n' % (len(parsed_input) - 10))
            else:
                print('')
        df = df.drop(parsed_input, axis = 1)
        return df

    def newname_getter(df, parsed_input, newname = 'combine', prinf = True, merging_subcorpora = False):
        """makes appropriate name for merged entries"""
        if merging_subcorpora:
            if newname is False:
                newname = 'combine'
        if type(newname) == int:
            the_newname = list(df.columns)[newname]
        # unicode names are accepted too; previously they left the_newname
        # unbound
        elif type(newname) == str or type(newname) == unicode:
            if newname == 'combine':
                if len(parsed_input) <= 3:
                    the_newname = '/'.join(parsed_input)
                elif len(parsed_input) > 3:
                    the_newname = '/'.join(parsed_input[:3]) + '...'
            else:
                the_newname = newname
        if newname is False:
            # revise this code
            # name the merged entry after its most frequent member
            import operator
            sumdict = {}
            for item in parsed_input:
                summed = sum(list(df[item]))
                sumdict[item] = summed
            the_newname = max(sumdict.items(), key=operator.itemgetter(1))[0]
        if type(the_newname) != unicode:
            the_newname = unicode(the_newname, errors = 'ignore')
        return the_newname

    def merge_these_entries(df, parsed_input, the_newname, prinf = True, merging = 'entries'):
        """make new entry with sum of parsed input, dropping the old
        entries unless multiple_merge is set"""
        if len(parsed_input) == 0:
            import warnings
            warnings.warn('No %s could be automatically merged.\n' % merging)
        else:
            if prinf:
                print('Merging %d %s as "%s":\n %s' % (len(parsed_input), merging, the_newname, '\n '.join(parsed_input[:10])))
                if len(parsed_input) > 10:
                    print('... and %d more ... \n' % (len(parsed_input) - 10))
                else:
                    print('')
        # remove old entries
        temp = sum([df[i] for i in parsed_input])
        if not multiple_merge:
            if type(df) == pandas.core.series.Series:
                df = df.drop(parsed_input)
            else:
                df = df.drop(parsed_input, axis = 1)
        df[the_newname] = temp
        return df

    def just_these_subcorpora(df, lst_of_subcorpora, prinf = True):
        """drop every subcorpus (row) not in lst_of_subcorpora"""
        # accept a bare int, as skip_these_subcorpora does
        if type(lst_of_subcorpora) == int:
            lst_of_subcorpora = [lst_of_subcorpora]
        if type(lst_of_subcorpora[0]) == int:
            lst_of_subcorpora = [str(l) for l in lst_of_subcorpora]
        good_years = [subcorpus for subcorpus in list(df.index) if subcorpus in lst_of_subcorpora]
        if prinf:
            print('Keeping %d subcorpora:\n %s' % (len(good_years), '\n '.join(good_years[:10])))
            if len(good_years) > 10:
                print('... and %d more ... \n' % (len(good_years) - 10))
            else:
                print('')
        df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus not in good_years], axis = 0)
        return df

    def skip_these_subcorpora(df, lst_of_subcorpora, prinf = True):
        """drop every subcorpus (row) in lst_of_subcorpora"""
        if type(lst_of_subcorpora) == int:
            lst_of_subcorpora = [lst_of_subcorpora]
        if type(lst_of_subcorpora[0]) == int:
            lst_of_subcorpora = [str(l) for l in lst_of_subcorpora]
        bad_years = [subcorpus for subcorpus in list(df.index) if subcorpus in lst_of_subcorpora]
        if len(bad_years) == 0:
            import warnings
            warnings.warn('No subcorpora skipped.\n')
        else:
            if prinf:
                print('Skipping %d subcorpora:\n %s' % (len(bad_years), '\n '.join([str(i) for i in bad_years[:10]])))
                if len(bad_years) > 10:
                    print('... and %d more ... \n' % (len(bad_years) - 10))
                else:
                    print('')
        df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus in bad_years], axis = 0)
        return df

    def span_these_subcorpora(df, lst_of_subcorpora, prinf = True):
        """select only a span of numerical suborpora (first, last)"""
        non_totals = [subcorpus for subcorpus in list(df.index)]
        good_years = [subcorpus for subcorpus in non_totals if int(subcorpus) >= int(lst_of_subcorpora[0]) and int(subcorpus) <= int(lst_of_subcorpora[-1])]
        if len(lst_of_subcorpora) == 0:
            import warnings
            warnings.warn('Span not identified.\n')
        else:
            if prinf:
                print('Keeping subcorpora:\n %d--%d\n' % (int(lst_of_subcorpora[0]), int(lst_of_subcorpora[-1])))
        df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus not in good_years], axis = 0)
        # retotal needed here
        return df

    def projector(df, list_of_tuples, prinf = True):
        """project abs values

        Accepts a single ``(subcorpus, n)`` tuple, a list/tuple of such
        pairs, or a ``{subcorpus: n}`` dict."""
        if type(list_of_tuples) == tuple:
            # a bare (subcorpus, n) pair, as documented, or a tuple of pairs
            if len(list_of_tuples) == 2 and type(list_of_tuples[0]) not in (tuple, list):
                list_of_tuples = [list_of_tuples]
            else:
                list_of_tuples = list(list_of_tuples)
        if type(list_of_tuples) == list:
            tdict = {}
            for a, b in list_of_tuples:
                tdict[a] = b
            list_of_tuples = tdict
        for subcorpus, projection_value in list_of_tuples.items():
            if type(subcorpus) == int:
                subcorpus = str(subcorpus)
            df.ix[subcorpus] = df.ix[subcorpus] * projection_value
            if prinf:
                if type(projection_value) == float:
                    print('Projection: %s * %s' % (subcorpus, projection_value))
                if type(projection_value) == int:
                    print('Projection: %s * %d' % (subcorpus, projection_value))
        if prinf:
            print('')
        return df

    def do_stats(df):
        """do linregress and add to df

        Returns False if scipy cannot be imported."""
        try:
            from scipy.stats import linregress
        except ImportError:
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: sort type not available in this verion of corpkit.' % thetime)
            return False

        slopes = []
        intercepts = []
        rs = []
        ps = []
        stderrs = []
        indices = list(df.index)
        first_year = list(df.index)[0]
        # numeric subcorpus names become offsets from the first one;
        # anything else is just numbered in order
        try:
            x = [int(y) - int(first_year) for y in indices]
        except ValueError:
            x = range(len(indices))

        statfields = ['slope', 'intercept', 'r', 'p', 'stderr']

        for entry in list(df.columns):
            y = list(df[entry])
            # NB: this p is local to do_stats and does not touch editor's p
            slope, intercept, r, p, stderr = linregress(x, y)
            slopes.append(slope)
            intercepts.append(intercept)
            rs.append(r)
            ps.append(p)
            stderrs.append(stderr)

        sl = pd.DataFrame([slopes, intercepts, rs, ps, stderrs],
                          index = statfields,
                          columns = list(df.columns))
        df = df.append(sl)

        # drop infinites and nans
        if operation != 'd':
            df = df.replace([np.inf, -np.inf], np.nan)
            df = df.fillna(0.0)
        return df

    def recalc(df, operation = '%'):
        """Add totals to the dataframe1

        A 'temp-Total' column and row are added, leaving any stats rows
        out of the sums where they can be dropped."""
        statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
        try:
            df['temp-Total'] = df.drop(statfields).sum(axis = 1)
        except Exception:
            df['temp-Total'] = df.sum(axis = 1)
        df = df.T
        try:
            df['temp-Total'] = df.drop(statfields).sum(axis = 1)
        except Exception:
            df['temp-Total'] = df.sum(axis = 1)
        df = df.T
        return df

    def resort(df, sort_by = False, keep_stats = False):
        """sort results, potentially using scipy's linregress

        Returns False if stats are needed but scipy is unavailable."""
        # translate options and make sure they are parseable
        options = ['total', 'name', 'infreq', 'increase', 'turbulent',
                   'decrease', 'static', 'most', 'least', 'none']

        if sort_by is True:
            sort_by = 'total'
        if sort_by == 'most':
            sort_by = 'total'
        if sort_by == 'least':
            sort_by = 'infreq'
        if sort_by not in options:
            raise ValueError("sort_by parameter error: '%s' not recognised. Must be True, False, %s" % (sort_by, ', '.join(options)))

        if operation.startswith('k'):
            if type(df) == pandas.core.series.Series:
                if sort_by == 'total':
                    df = df.order(ascending = False)
                elif sort_by == 'infreq':
                    df = df.order(ascending = True)
                elif sort_by == 'name':
                    df = df.sort_index()
                return df

        if just_totals:
            if sort_by == 'infreq':
                df = df.sort(columns = 'Combined total', ascending = True)
            elif sort_by == 'total':
                df = df.sort(columns = 'Combined total', ascending = False)
            elif sort_by == 'name':
                df = df.sort_index()
            return df

        # this is really shitty now that i know how to sort, like in the above
        if keep_stats:
            df = do_stats(df)
            if type(df) == bool:
                if df is False:
                    return False
        if sort_by == 'total':
            if df1_istotals:
                df = df.T
            df = recalc(df, operation = operation)
            tot = df.ix['temp-Total']
            df = df[tot.argsort()[::-1]]
            df = df.drop('temp-Total', axis = 0)
            df = df.drop('temp-Total', axis = 1)
            if df1_istotals:
                df = df.T
        elif sort_by == 'infreq':
            if df1_istotals:
                df = df.T
            df = recalc(df, operation = operation)
            tot = df.ix['temp-Total']
            df = df[tot.argsort()]
            df = df.drop('temp-Total', axis = 0)
            df = df.drop('temp-Total', axis = 1)
            if df1_istotals:
                df = df.T
        elif sort_by == 'name':
            # currently case sensitive...
            df = df.reindex_axis(sorted(df.columns), axis=1)
        else:
            statfields = ['slope', 'intercept', 'r', 'p', 'stderr']

            if not keep_stats:
                df = do_stats(df)
                if type(df) == bool:
                    if df is False:
                        return False

            slopes = df.ix['slope']
            if sort_by == 'increase':
                df = df[slopes.argsort()[::-1]]
            elif sort_by == 'decrease':
                df = df[slopes.argsort()]
            elif sort_by == 'static':
                df = df[slopes.abs().argsort()]
            elif sort_by == 'turbulent':
                df = df[slopes.abs().argsort()[::-1]]
            if remove_above_p:
                # the easy way to do it!
                df = df.T
                df = df[df['p'] <= p]
                df = df.T

            # remove stats field by default
            if not keep_stats:
                df = df.drop(statfields, axis = 0)
        return df

    def set_threshold(big_list, threshold, prinf = True, for_keywords = False):
        """turn 'low'/'medium'/'high' into a number relative to the size
        of big_list; any non-string threshold is passed through as-is"""
        if type(threshold) == str:
            if threshold.startswith('l'):
                denominator = 10000
            elif threshold.startswith('m'):
                denominator = 5000
            elif threshold.startswith('h'):
                denominator = 2500
            else:
                raise ValueError("threshold parameter error: '%s' not recognised. "
                                 "Must be a number, 'low', 'medium' or 'high'." % threshold)

            if type(big_list) == pandas.core.frame.DataFrame:
                tot = big_list.sum().sum()
            elif type(big_list) == pandas.core.series.Series:
                tot = big_list.sum()
            else:
                # already a single total (e.g. just_totals with a Series
                # as dataframe2)
                tot = big_list
            the_threshold = float(tot) / float(denominator)
        else:
            the_threshold = threshold
        if prinf:
            print('Threshold: %d\n' % the_threshold)
        return the_threshold

    # check if we're in concordance mode
    try:
        if list(dataframe1.columns) == ['l', 'm', 'r']:
            conc_lines = True
        else:
            conc_lines = False
    except Exception:
        conc_lines = False

    # copy dataframe to be very safe
    try:
        df = dataframe1.copy()
    except AttributeError:
        no_good_dataframe1 = True
        while no_good_dataframe1:
            if 'interrogation' in str(type(dataframe1)):
                sel = raw_input("\nIt looks like you're trying to edit an interrogation, " \
                                "rather than an interrogation's .results or .totals branch. You can:\n\n a) select .results branch\n b) select .totals branch\n c) exit\n\nYour choice: ")
                if sel.startswith('a'):
                    try:
                        dataframe1 = dataframe1.results
                        no_good_dataframe1 = False
                    except Exception:
                        pass
                elif sel.startswith('b'):
                    try:
                        dataframe1 = dataframe1.totals
                        no_good_dataframe1 = False
                    except Exception:
                        pass
                else:
                    return
            else:
                raise ValueError("Thing to be edited (dataframe1) needs to be a Pandas DataFrame or Series. " \
                                 "Right now, its type is: '%s'." % type(dataframe1).__name__)
        df = dataframe1.copy()

    # make cols into strings
    try:
        df.columns = [str(c) for c in list(df.columns)]
    except Exception:
        pass

    if operation is None:
        operation = 'None'

    # do concordance work
    if conc_lines:
        df = dataframe1.copy()

        if just_entries:
            if type(just_entries) == int:
                just_entries = [just_entries]
            if type(just_entries) == str:
                df = df[df['m'].str.contains(just_entries)]
            if type(just_entries) == list:
                if type(just_entries[0]) == str:
                    regex = re.compile(r'(?i)^(' + r'|'.join(just_entries) + r')$')
                    df = df[df['m'].str.contains(regex)]
                else:
                    df = df.ix[just_entries].reset_index(drop = True)

        if skip_entries:
            if type(skip_entries) == int:
                skip_entries = [skip_entries]
            if type(skip_entries) == str:
                df = df[~df['m'].str.contains(skip_entries)]
            if type(skip_entries) == list:
                if type(skip_entries[0]) == str:
                    regex = re.compile(r'(?i)^(' + r'|'.join(skip_entries) + r')$')
                    df = df[~df['m'].str.contains(regex)]
                else:
                    df = df.ix[[e for e in list(df.index) if e not in skip_entries]].reset_index(drop = True)

        return df

    if print_info:
        print('\n***Processing results***\n========================\n')

    df1_istotals = False
    if type(df) == pandas.core.series.Series:
        # if just a single result
        df1_istotals = True
        df = pandas.DataFrame(df)
    else:
        df = pandas.DataFrame(df)
    if operation.startswith('k'):
        if sort_by is False:
            if not df1_istotals:
                sort_by = 'turbulent'
        if df1_istotals:
            df = df.T

    # figure out if there's a second list
    # copy and remove totals if there is
    single_totals = True
    using_totals = False
    outputmode = False

    try:
        if dataframe2.empty is False:
            df2 = dataframe2.copy()
            using_totals = True

            if type(df2) == pandas.core.frame.DataFrame:
                if len(df2.columns) > 1:
                    single_totals = False
                else:
                    df2 = pd.Series(df2)
                if operation == 'd':
                    df2 = df2.sum(axis = 1)
                    single_totals = True
            elif type(df2) == pandas.core.series.Series:
                single_totals = True
            else:
                raise ValueError('dataframe2 not recognised.')
    except AttributeError:
        # dataframe2 is not a pandas object (False, 'self', ...)
        if operation in ['k', 'd', 'a', '%', '/', '*', '-', '+']:
            dataframe2 = 'self'
        if dataframe2 == 'self':
            outputmode = True

    if operation.startswith('a') or operation.startswith('A'):
        if list(df.columns)[0] != '0' and list(df.columns)[0] != 0:
            df = df.T
        if using_totals:
            if not single_totals:
                df2 = df2.T

    if projection:
        # projection shouldn't do anything when working with '%', remember.
        df = projector(df, projection, prinf = print_info)
        if using_totals:
            df2 = projector(df2, projection, prinf = False)

    if spelling:
        df = convert_spell(df, convert_to = spelling)
        df = merge_duplicates(df, print_info = False)

        if not single_totals:
            df2 = convert_spell(df2, convert_to = spelling, print_info = False)
            df2 = merge_duplicates(df2, print_info = False)
        if not df1_istotals:
            sort_by = 'total'

    if replace_names:
        df = name_replacer(df, replace_names)
        df = merge_duplicates(df)
        if not single_totals:
            # replace_names was previously missing from this call, which
            # raised TypeError
            df2 = name_replacer(df2, replace_names, print_info = False)
            df2 = merge_duplicates(df2, print_info = False)
        if not sort_by:
            sort_by = 'total'

    # remove old stats if they're there:
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        df = df.drop(statfields, axis = 0)
    except Exception:
        pass
    if using_totals:
        try:
            df2 = df2.drop(statfields, axis = 0)
        except Exception:
            pass

    # remove totals and tkinter order
    for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
        try:
            df = df.drop(name, axis = ax, errors = 'ignore')
        except Exception:
            pass
    for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
        # df2 may not exist at all here; that is swallowed too
        try:
            df2 = df2.drop(name, axis = ax, errors = 'ignore')
        except Exception:
            pass

    # merging: make dicts if they aren't already, so we can iterate
    if merge_entries:
        if type(merge_entries) != list:
            if type(merge_entries) == str or type(merge_entries) == unicode:
                merge_entries = {newname: merge_entries}
            # for newname, criteria
            for name, the_input in sorted(merge_entries.items()):
                the_newname = newname_getter(df, parse_input(df, the_input), newname = name, prinf = print_info)
                df = merge_these_entries(df, parse_input(df, the_input), the_newname, prinf = print_info)
                if not single_totals:
                    df2 = merge_these_entries(df2, parse_input(df2, the_input), the_newname, prinf = False)
        else:
            # a list is one set of criteria, so merge once (this used to
            # be repeated once per list element)
            the_newname = newname_getter(df, parse_input(df, merge_entries), newname = newname, prinf = print_info)
            df = merge_these_entries(df, parse_input(df, merge_entries), the_newname, prinf = print_info)
            if not single_totals:
                df2 = merge_these_entries(df2, parse_input(df2, merge_entries), the_newname, prinf = False)

    if merge_subcorpora:
        if type(merge_subcorpora) != dict:
            if type(merge_subcorpora) == list:
                if type(merge_subcorpora[0]) == tuple:
                    merge_subcorpora = {x: y for x, y in merge_subcorpora}
                elif type(merge_subcorpora[0]) == str or type(merge_subcorpora[0]) == unicode:
                    merge_subcorpora = {new_subcorpus_name: [x for x in merge_subcorpora]}
                elif type(merge_subcorpora[0]) == int:
                    merge_subcorpora = {new_subcorpus_name: [str(x) for x in merge_subcorpora]}
            else:
                merge_subcorpora = {new_subcorpus_name: merge_subcorpora}
        # subcorpora are rows, so work on the transpose and flip back
        for name, the_input in sorted(merge_subcorpora.items()):
            the_newname = newname_getter(df.T, parse_input(df.T, the_input),
                                         newname = name, merging_subcorpora = True, prinf = print_info)
            df = merge_these_entries(df.T, parse_input(df.T, the_input), the_newname,
                                     merging = 'subcorpora', prinf = print_info).T
            if using_totals:
                df2 = merge_these_entries(df2.T, parse_input(df2.T, the_input), the_newname,
                                          merging = 'subcorpora', prinf = False).T

    if just_subcorpora:
        df = just_these_subcorpora(df, just_subcorpora, prinf = print_info)
        if using_totals:
            df2 = just_these_subcorpora(df2, just_subcorpora, prinf = False)

    if skip_subcorpora:
        df = skip_these_subcorpora(df, skip_subcorpora, prinf = print_info)
        if using_totals:
            df2 = skip_these_subcorpora(df2, skip_subcorpora, prinf = False)

    if span_subcorpora:
        df = span_these_subcorpora(df, span_subcorpora, prinf = print_info)
        if using_totals:
            df2 = span_these_subcorpora(df2, span_subcorpora, prinf = False)

    if just_entries:
        df = just_these_entries(df, parse_input(df, just_entries), prinf = print_info)
        if not single_totals:
            df2 = just_these_entries(df2, parse_input(df2, just_entries), prinf = False)

    if skip_entries:
        df = skip_these_entries(df, parse_input(df, skip_entries), prinf = print_info)
        if not single_totals:
            df2 = skip_these_entries(df2, parse_input(df2, skip_entries), prinf = False)

    # drop infinites and nans
    if operation != 'd':
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.fillna(0.0)

    # make just_totals as dataframe
    if just_totals:
        df = pd.DataFrame(df.sum(), columns = ['Combined total'])
        if using_totals:
            if not single_totals:
                df2 = pd.DataFrame(df2.sum(), columns = ['Combined total'])
            else:
                df2 = df2.sum()

    tots = df.sum(axis = 1)

    if using_totals or outputmode:
        if not operation.startswith('k'):
            the_threshold = 0
            # set a threshold if just_totals
            if outputmode is True:
                # 'self' mode: the edited dataframe1 supplies its own totals
                df2 = df.T.sum()
                if not just_totals:
                    df2.name = 'Total'
                else:
                    df2.name = 'Combined total'
                using_totals = True
                single_totals = True
            if just_totals:
                if not single_totals:
                    the_threshold = set_threshold(df2, threshold, prinf = print_info)
            if operation == 'd':
                the_threshold = set_threshold(df2, threshold, prinf = print_info)
            df, tots = combiney(df, df2, operation = operation, threshold = the_threshold, prinf = print_info)

    # if doing keywording...
    if operation.startswith('k'):
        from keys import keywords

        # allow saved dicts to be df2, etc
        try:
            if dataframe2 == 'self':
                df2 = df.copy()
        except (TypeError, ValueError):
            # comparing/truth-testing a pandas object: not 'self'
            pass
        if type(dataframe2) == str:
            if dataframe2 != 'self':
                df2 = dataframe2
        else:
            the_threshold = False

        df = keywords(df, df2,
                      selfdrop = selfdrop,
                      threshold = threshold,
                      printstatus = print_info,
                      editing = True,
                      calc_all = calc_all,
                      **kwargs)

        # eh?
        df = df.T

    # drop infinites and nans
    if operation != 'd':
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.fillna(0.0)

    # resort data
    if sort_by:
        df = resort(df, keep_stats = keep_stats, sort_by = sort_by)
        if type(df) == bool:
            if df is False:
                return 'linregress'

    if keep_top:
        if not just_totals:
            df = df[list(df.columns)[:keep_top]]
        else:
            df = df.head(keep_top)

    if just_totals:
        # turn just_totals into series:
        df = pd.Series(df['Combined total'], name = 'Combined total')

    if df1_istotals:
        if operation.startswith('k'):
            try:
                df = pd.Series(df.ix[dataframe1.name])
                df.name = '%s: keyness' % df.name
            except Exception:
                df = df.iloc[0,:]
                # was "'keyness' % df.name", which raises TypeError
                df.name = 'keyness'

    # generate totals branch if not percentage results:
    # fix me
    if df1_istotals or operation.startswith('k'):
        if not just_totals:
            try:
                total = pd.Series(df['Total'], name = 'Total')
            except Exception:
                total = 'none'
        else:
            total = 'none'
    else:
        # might be wrong if using division or something...
        try:
            total = df.T.sum(axis = 1)
        except Exception:
            total = 'none'

    # prefer totals generated during the maths, if there were any
    if type(tots) != pandas.core.frame.DataFrame and type(tots) != pandas.core.series.Series:
        total = df.sum(axis = 1)
    else:
        total = tots

    if type(df) == pandas.core.frame.DataFrame:
        datatype = df.ix[0].dtype
    else:
        datatype = df.dtype

    # TURN INT COL NAMES INTO STR
    try:
        df.results.columns = [str(d) for d in list(df.results.columns)]
    except Exception:
        pass

    def add_tkt_index(df):
        """add a 'tkintertable-order' row recording the current column order"""
        if type(df) != pandas.core.series.Series:
            df = df.T
            df = df.drop('tkintertable-order', errors = 'ignore', axis = 0)
            df = df.drop('tkintertable-order', errors = 'ignore', axis = 1)
            df['tkintertable-order'] = pd.Series([index for index, data in enumerate(list(df.index))], index = list(df.index))
            df = df.T
        return df

    # while tkintertable can't sort rows
    from corpkit.tests import check_t_kinter
    tk = check_t_kinter()
    if tk:
        df = add_tkt_index(df)

    if 'df1_always_df' in kwargs.keys():
        if kwargs['df1_always_df'] is True:
            if type(df) == pandas.core.series.Series:
                df = pandas.DataFrame(df)

    # make named_tuple
    the_operation = 'none'
    if using_totals:
        the_operation = operation

    the_options = {}
    the_options['time_started'] = the_time_started
    the_options['function'] = 'editor'
    the_options['dataframe1'] = dataframe1
    the_options['operation'] = the_operation
    the_options['dataframe2'] = dataframe2
    the_options['datatype'] = datatype
    the_options['sort_by'] = sort_by
    the_options['keep_stats'] = keep_stats
    the_options['just_totals'] = just_totals
    the_options['threshold'] = threshold # can be wrong!
    the_options['just_entries'] = just_entries
    the_options['skip_entries'] = skip_entries
    the_options['merge_entries'] = merge_entries
    the_options['newname'] = newname
    the_options['just_subcorpora'] = just_subcorpora
    the_options['skip_subcorpora'] = skip_subcorpora
    the_options['span_subcorpora'] = span_subcorpora
    the_options['merge_subcorpora'] = merge_subcorpora
    the_options['new_subcorpus_name'] = new_subcorpus_name
    the_options['projection'] = projection
    the_options['remove_above_p'] = remove_above_p
    the_options['p'] = p
    the_options['revert_year'] = revert_year
    the_options['print_info'] = print_info

    outputnames = collections.namedtuple('edited_interrogation', ['query', 'results', 'totals'])
    output = outputnames(the_options, df, total)

    if print_info:
        print('***Done!***\n========================\n')

    # display options are set globally as a side effect
    if operation.startswith('k') or just_totals or df1_istotals:
        pd.set_option('display.max_rows', 30)
    else:
        pd.set_option('display.max_rows', 15)
    pd.set_option('display.max_columns', 8)
    pd.set_option('max_colwidth',70)
    pd.set_option('display.width', 800)
    pd.set_option('expand_frame_repr', False)
    pd.set_option('display.float_format', lambda x: '%.2f' % x)
    return output
def plotter(title,
            df,
            x_label = None,
            y_label = None,
            style = 'ggplot',
            figsize = (8, 4),
            save = False,
            legend_pos = 'best',
            reverse_legend = 'guess',
            num_to_plot = 7,
            tex = 'try',
            colours = 'Paired',
            cumulative = False,
            pie_legend = True,
            partial_pie = False,
            show_totals = False,
            transparent = False,
            output_format = 'png',
            interactive = False,
            black_and_white = False,
            show_p_val = False,
            indices = 'guess',
            **kwargs):
    """Plot interrogator() or editor() output.

    Remaining ``**kwargs`` are passed to ``DataFrame.plot()``, which can then
    send them through to matplotlib:

    http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.plot.html
    http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.plot

    NOTE(review): a second ``plotter`` definition later in this file rebinds
    the name, so this version appears to be shadowed at import time.

    :param title: Title of the plot (not drawn when *subplots* is True); also
                  used to build the file name when *save* is not a str
    :type title: str
    :param df: Data to plot; it is copied, never modified in place
    :type df: pandas.core.frame.DataFrame/pandas.core.series.Series
    :param x_label: x axis label. ``None``: guess ('Year' or 'Group');
                    ``False``: no label
    :type x_label: str/None/False
    :param y_label: y axis label. ``None``: 'Percentage' or
                    'Absolute frequency'; ``False``: no label
    :type y_label: str/None/False
    :param style: One of 'dark_background', 'bmh', 'grayscale', 'ggplot',
                  'fivethirtyeight'
    :type style: str
    :param figsize: Figure size handed to ``DataFrame.plot()``
    :type figsize: tuple
    :param save: If str, use it as the file name; otherwise, if true, name
                 the file after *title*
    :type save: bool/str
    :param legend_pos: A matplotlib location name, one of the 'outside ...'
                       names handled below, or a location int
    :type legend_pos: str/int
    :param reverse_legend: Reverse legend order; 'guess' reverses for 'barh'
                           and 'area' plots
    :type reverse_legend: bool/'guess'
    :param num_to_plot: How many entries to plot
    :type num_to_plot: int/'all'
    :param tex: 'try' or ``True`` to attempt TeX text rendering
    :type tex: bool/'try'
    :param colours: Colourmap name; ``True`` and 'Default' mean 'Paired'
    :type colours: str/bool
    :param cumulative: Plot cumulative sums
    :type cumulative: bool
    :param pie_legend: False to label slices rather than give legend
    :type pie_legend: bool
    :param partial_pie: For pie plots, divide the data by 100 before plotting
    :type partial_pie: bool
    :param show_totals: Where to show percent/abs frequencies
    :type show_totals: False/'plot'/'legend'/'both'
    :param transparent: Make the figure background transparent
    :type transparent: bool
    :param output_format: File format for the saved image
    :type output_format: str
    :param interactive: Use mpld3 (not available for pie plots or subplots)
    :type interactive: bool
    :param black_and_white: Greyscale colours; dashed/marked lines for line plots
    :type black_and_white: bool
    :param show_p_val: Append p values to entry names when a 'p' row exists
    :type show_p_val: bool
    :param indices: Transpose the data and plot everything; 'guess' turns
                    this on when every column name is a whole number
    :type indices: bool/'guess'
    :raises ValueError: for an unknown *style* or *output_format*, or when the
                        saved file is not found after saving
    :raises KeyError: for an unknown *legend_pos* string
    :returns: ``mpld3.display()`` output when interactive; the ``pyplot``
              module when running under spider/tkinter or with subplots;
              otherwise ``None``
    """

    import corpkit
    import os
    # imported up front: a function-level import makes ``warnings`` local to
    # the whole function, so importing it later breaks every earlier use
    import warnings
    import matplotlib as mpl
    if interactive:
        import matplotlib.pyplot as plt, mpld3
    else:
        import matplotlib.pyplot as plt
    from matplotlib import rc
    import pandas
    import pandas as pd
    from pandas import DataFrame

    import numpy
    from time import localtime, strftime
    from corpkit.tests import check_pytex, check_spider, check_t_kinter

    if interactive:
        import mpld3
        import collections
        from mpld3 import plugins, utils
        # NOTE(review): presumably a project-local ``plugins`` module found
        # via a Python 2 implicit relative import -- confirm
        from plugins import InteractiveLegendPlugin, HighlightLines

    tk = check_t_kinter()

    running_python_tex = check_pytex()
    # incorrect spelling of spider on purpose
    running_spider = check_spider()

    def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
        """remove extreme values from colourmap --- no pure white"""
        import matplotlib.colors as colors
        import numpy as np
        new_cmap = colors.LinearSegmentedColormap.from_list(
            'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval),
            cmap(np.linspace(minval, maxval, n)))
        return new_cmap

    def get_savename(imagefolder, save = False, title = False, ext = 'png'):
        """Come up with the savename for the image."""
        import os

        def urlify(s):
            "Turn title into filename"
            import re
            s = s.lower()
            s = re.sub(r"[^\w\s-]", '', s)
            s = re.sub(r"\s+", '-', s)
            s = re.sub(r"-(textbf|emph|textsc|textit)", '-', s)
            return s

        if not ext.startswith('.'):
            ext = '.' + ext
        if type(save) == str:
            savename = os.path.join(imagefolder, (urlify(save) + ext))
        # this 'else' is redundant now that title is obligatory
        else:
            if title:
                filename = urlify(title) + ext
                savename = os.path.join(imagefolder, filename)

        # remove duplicated ext
        if savename.endswith('%s%s' % (ext, ext)):
            savename = savename.replace('%s%s' % (ext, ext), ext, 1)
        return savename

    def rename_data_with_total(dataframe, was_series = False, using_tex = False, absolutes = True):
        """adds totals (abs, rel, keyness) to entry name strings"""
        if was_series:
            where_the_words_are = dataframe.index
        else:
            where_the_words_are = dataframe.columns
        the_labs = []
        for w in list(where_the_words_are):
            if not absolutes:
                if not was_series:
                    # no single percentage can be given for a whole column
                    the_labs.append(w)
                    continue
                perc = dataframe.T[w][0]
                if using_tex:
                    # the percent sign has to be escaped for TeX
                    the_labs.append('%s (%.2f\\%%)' % (w, perc))
                else:
                    the_labs.append('%s (%.2f %%)' % (w, perc))
            else:
                if was_series:
                    score = dataframe.T[w].sum()
                else:
                    score = dataframe[w].sum()
                the_labs.append('%s (n=%d)' % (w, score))
        if not was_series:
            dataframe.columns = the_labs
        else:
            vals = list(dataframe[list(dataframe.columns)[0]].values)
            dataframe = pd.DataFrame(vals, index = the_labs)
            dataframe.columns = ['Total']
        return dataframe

    def auto_explode(dataframe, input, was_series = False, num_to_plot = 7):
        """give me a list of strings and i'll output explode option"""
        output = [0 for s in range(num_to_plot)]
        if was_series:
            l = list(dataframe.index)
        else:
            l = list(dataframe.columns)

        if type(input) == str or type(input) == int:
            input = [input]
        if type(input) == list:
            for i in input:
                if type(i) == str:
                    index = l.index(i)
                else:
                    index = i
                output[index] = 0.1
        return output

    # are we doing subplots?
    sbplt = False
    if 'subplots' in kwargs:
        if kwargs['subplots'] is True:
            sbplt = True

    if colours is True:
        colours = 'Paired'

    styles = ['dark_background', 'bmh', 'grayscale', 'ggplot', 'fivethirtyeight']
    if style not in styles:
        raise ValueError('Style %s not found. Use %s' % (style, ', '.join(styles)))

    if 'savepath' in kwargs.keys():
        mpl.rcParams['savefig.directory'] = kwargs['savepath']
        del kwargs['savepath']

    mpl.rcParams['savefig.bbox'] = 'tight'

    # try to use tex
    # TO DO:
    # make some font kwargs here
    using_tex = False
    mpl.rcParams['font.family'] = 'sans-serif'
    mpl.rcParams['text.latex.unicode'] = True

    if tex == 'try' or tex is True:
        try:
            rc('text', usetex=True)
            rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
            using_tex = True
        except Exception:
            # only ``mpl`` and ``rc`` are bound in this function, so the
            # fallback must not go through the bare name ``matplotlib``
            mpl.rc('font', family='sans-serif')
            mpl.rc('font', serif='Helvetica Neue')
            mpl.rc('text', usetex='false')
            rc('text', usetex=False)
    else:
        rc('text', usetex=False)

    if interactive:
        using_tex = False

    if show_totals is False:
        show_totals = 'none'

    # find out what kind of plot we're making, and enable
    # or disable interactive values if need be
    if 'kind' not in kwargs:
        kwargs['kind'] = 'line'

    if interactive:
        if kwargs['kind'].startswith('bar'):
            interactive_types = [3]
        elif kwargs['kind'] == 'area':
            interactive_types = [2, 3]
        elif kwargs['kind'] == 'line':
            interactive_types = [2, 3]
        elif kwargs['kind'] == 'pie':
            # fall back to a static pie instead of failing on the
            # membership tests against interactive_types further down
            warnings.warn('Interactive plotting not yet available for pie plots.')
            interactive = False
        else:
            interactive_types = [None]
    if not interactive:
        interactive_types = [None]

    # find out if pie mode, add autopct format
    piemode = False
    if 'kind' in kwargs:
        if kwargs['kind'] == 'pie':
            piemode = True
            if show_totals.endswith('plot') or show_totals.endswith('both'):
                kwargs['pctdistance'] = 0.6
                if using_tex:
                    kwargs['autopct'] = r'%1.1f\%%'
                else:
                    kwargs['autopct'] = '%1.1f%%'

    kwargs['subplots'] = sbplt

    # copy data, make series into df
    dataframe = df.copy()
    was_series = False
    if type(dataframe) == pandas.core.series.Series:
        was_series = True
        if not cumulative:
            dataframe = DataFrame(dataframe)
        else:
            dataframe = DataFrame(dataframe.cumsum())
    else:
        # don't know if this is much good.
        if cumulative:
            dataframe = DataFrame(dataframe.cumsum())
        if len(list(dataframe.columns)) == 1:
            was_series = True

    # attempt to convert x axis to ints:
    try:
        dataframe.index = [int(i) for i in list(dataframe.index)]
    except Exception:
        pass

    # remove totals and tkinter order
    if not was_series:
        for name, axis_num in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
            dataframe = dataframe.drop(name, axis = axis_num, errors = 'ignore')
    else:
        dataframe = dataframe.drop('tkintertable-order', errors = 'ignore')
        dataframe = dataframe.drop('tkintertable-order', axis = 1, errors = 'ignore')

    # look at columns to see if all can be ints, in which case, set up figure
    # for depnumming
    # NOTE(review): nesting of the 'indices is True' branch under
    # 'not was_series' was reconstructed -- confirm
    if not was_series:
        if indices == 'guess':
            def isint(x):
                try:
                    a = float(x)
                    b = int(a)
                except (ValueError, OverflowError, TypeError):
                    return False
                else:
                    return a == b

            if all([isint(x) is True for x in list(dataframe.columns)]):
                indices = True
            else:
                indices = False

        # if depnumming, plot all, transpose, and rename axes
        if indices is True:
            num_to_plot = 'all'
            dataframe = dataframe.T
            if y_label is None:
                y_label = 'Percentage of all matches'
            if x_label is None:
                x_label = ''

    # set backend?
    output_formats = ['svgz', 'ps', 'emf', 'rgba', 'raw', 'pdf', 'svg', 'eps', 'png', 'pgf']
    if output_format not in output_formats:
        raise ValueError('%s output format not recognised. Must be: %s' % (output_format, ', '.join(output_formats)))

    # don't know if these are necessary
    if 'pdf' in output_format:
        plt.switch_backend(output_format)
    if 'pgf' in output_format:
        plt.switch_backend(output_format)

    if num_to_plot == 'all':
        if was_series:
            num_to_plot = len(dataframe)
        else:
            if not piemode:
                num_to_plot = len(list(dataframe.columns))
            else:
                num_to_plot = len(dataframe.index)

    # explode pie, or remove if not piemode
    if 'explode' in kwargs:
        if not piemode:
            del kwargs['explode']
    if piemode:
        if 'explode' in kwargs:
            if not sbplt:
                kwargs['explode'] = auto_explode(dataframe,
                                                 kwargs['explode'],
                                                 was_series = was_series,
                                                 num_to_plot = num_to_plot)

    if 'legend' in kwargs:
        legend = kwargs['legend']
    else:
        legend = True

    # cut data short
    plotting_a_totals_column = False
    if was_series:
        if list(dataframe.columns)[0] != 'Total':
            try:
                # if every index value is int-like, keep every row
                can_be_ints = [int(x) for x in list(dataframe.index)]
                num_to_plot = len(dataframe)
            except Exception:
                dataframe = dataframe[:num_to_plot]
        elif list(dataframe.columns)[0] == 'Total':
            plotting_a_totals_column = True
            if not 'legend' in kwargs:
                legend = False
            num_to_plot = len(dataframe)
    else:
        dataframe = dataframe.T.head(num_to_plot).T

    # remove stats fields, put p in entry text, etc.
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        dataframe = dataframe.drop(statfields, axis = 1)
    except Exception:
        pass
    try:
        dataframe.loc['p']
        there_are_p_vals = True
    except Exception:
        there_are_p_vals = False
    if show_p_val:
        if there_are_p_vals:
            newnames = []
            for col in list(dataframe.columns):
                pval = dataframe[col]['p']
                newname = '%s (p=%s)' % (col, format(pval, '.5f'))
                newnames.append(newname)
            dataframe.columns = newnames
            dataframe.drop(statfields, axis = 0, inplace = True)
        else:
            warnings.warn('No p-values calculated to show.\n\nUse sort_by and keep_stats in editor() to generate these values.')
    else:
        if there_are_p_vals:
            dataframe.drop(statfields, axis = 0, inplace = True)

    # decide whether the data are absolute frequencies: only the first row
    # is inspected
    absolutes = True
    if type(dataframe) == pandas.core.frame.DataFrame:
        try:
            if not all([s.is_integer() for s in dataframe.iloc[0,:].values]):
                absolutes = False
        except Exception:
            pass
    else:
        if not all([s.is_integer() for s in dataframe.values]):
            absolutes = False

    # use colormap if need be: pie plots always get the colormap kwarg,
    # other kinds only when a colour map was asked for
    # NOTE(review): the original nesting here was ambiguous -- confirm
    if num_to_plot > 0:
        if colours == 'Default':
            colours = 'Paired'
        if colours or piemode:
            kwargs['colormap'] = colours

    # multicoloured bar charts
    if 'kind' in kwargs:
        if colours:
            if kwargs['kind'].startswith('bar'):
                if len(list(dataframe.columns)) == 1:
                    if not black_and_white:
                        import numpy as np
                        the_range = np.linspace(0, 1, num_to_plot)
                        cmap = plt.get_cmap(colours)
                        kwargs['colors'] = [cmap(n) for n in the_range]

    # reversing legend option
    rev_leg = reverse_legend is True

    # show legend or don't, guess whether to reverse based on kind
    if 'kind' in kwargs:
        if kwargs['kind'] in ['bar', 'barh', 'area', 'line', 'pie']:
            if was_series:
                legend = False
            if kwargs['kind'] == 'pie':
                if pie_legend:
                    legend = True
                else:
                    legend = False
        if kwargs['kind'] in ['barh', 'area']:
            if reverse_legend == 'guess':
                rev_leg = True

    # the default legend placement
    if legend_pos is True:
        legend_pos = 'best'

    # legends placed outside the axes need extra care when saving
    outside_legend = isinstance(legend_pos, str) and legend_pos.startswith('o')

    # cut dataframe if just_totals
    try:
        tst = dataframe['Combined total']
        dataframe = dataframe.head(num_to_plot)
    except Exception:
        pass

    # rotate automatically
    if 'rot' not in kwargs:
        if not was_series:
            xvals = [str(i) for i in list(dataframe.index)[:num_to_plot]]
        else:
            xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        if xvals and len(max(xvals, key=len)) > 6:
            if not piemode:
                kwargs['rot'] = 45

    # no title for subplots because ugly,
    if sbplt:
        if 'title' in kwargs:
            del kwargs['title']
    else:
        kwargs['title'] = title

    # no interactive subplots yet:
    if sbplt and interactive:
        warnings.warn('No interactive subplots yet, sorry.')
        return

    # always bound, so later lookups cannot hit an undefined name when no
    # legend is wanted
    leg_options = {}
    lgd = None

    if legend:
        # kwarg options go in leg_options
        leg_options = {'framealpha': .8}
        if 'shadow' in kwargs:
            leg_options['shadow'] = True
        if 'ncol' in kwargs:
            leg_options['ncol'] = kwargs['ncol']
            del kwargs['ncol']
        else:
            if num_to_plot > 6:
                # floor division keeps the column count an integer
                leg_options['ncol'] = num_to_plot // 7

        # determine legend position based on this dict
        if legend_pos:
            possible = {'best': 0,
                        'upper right': 1,
                        'upper left': 2,
                        'lower left': 3,
                        'lower right': 4,
                        'right': 5,
                        'center left': 6,
                        'center right': 7,
                        'lower center': 8,
                        'upper center': 9,
                        'center': 10,
                        'o r': 2,
                        'outside right': 2,
                        'outside upper right': 2,
                        'outside center right': 'center left',
                        'outside lower right': 'lower left'}
            if type(legend_pos) == int:
                the_loc = legend_pos
            elif type(legend_pos) == str:
                try:
                    the_loc = possible[legend_pos]
                except KeyError:
                    raise KeyError('legend_pos value must be one of:\n%s\n or an int between 0-10.' %', '.join(possible.keys()))
            leg_options['loc'] = the_loc
            # weirdness needed for outside plot
            if legend_pos in ['o r', 'outside right', 'outside upper right']:
                leg_options['bbox_to_anchor'] = (1.02, 1)
            if legend_pos == 'outside center right':
                leg_options['bbox_to_anchor'] = (1.02, 0.5)
            if legend_pos == 'outside lower right':
                # NOTE(review): the original had a no-op comparison here,
                # ``leg_options['loc'] == 'upper right'``. It may have been
                # meant as an assignment; 'loc' is left as 'lower left' so
                # the rendered position is unchanged -- confirm the intent.
                leg_options['bbox_to_anchor'] = (0.5, 0.5)

        # a bit of distance between legend and plot for outside legends
        if outside_legend:
            leg_options['borderaxespad'] = 1

    if not piemode:
        if show_totals.endswith('both') or show_totals.endswith('legend'):
            dataframe = rename_data_with_total(dataframe,
                                               was_series = was_series,
                                               using_tex = using_tex,
                                               absolutes = absolutes)
    else:
        if pie_legend:
            if show_totals.endswith('both') or show_totals.endswith('legend'):
                dataframe = rename_data_with_total(dataframe,
                                                   was_series = was_series,
                                                   using_tex = using_tex,
                                                   absolutes = absolutes)

    if piemode:
        if partial_pie:
            dataframe = dataframe / 100.0

    # some pie things
    if piemode:
        if not sbplt:
            kwargs['y'] = list(dataframe.columns)[0]
            if pie_legend:
                kwargs['legend'] = False
                if was_series:
                    leg_options['labels'] = list(dataframe.index)
                else:
                    leg_options['labels'] = list(dataframe.columns)
        else:
            if pie_legend:
                kwargs['legend'] = False
                leg_options['labels'] = list(dataframe.index)

    areamode = False
    if 'kind' in kwargs:
        if kwargs['kind'] == 'area':
            areamode = True

    if legend is False:
        kwargs['legend'] = False

    # cumulative grab first col
    if cumulative:
        kwargs['y'] = list(dataframe.columns)[0]

    # line highlighting option for interactive!
    if interactive:
        if 2 in interactive_types:
            if kwargs['kind'] == 'line':
                kwargs['marker'] = ','
        if not piemode:
            kwargs['alpha'] = 0.1

    # convert dates --- works only in my current case!
    if plotting_a_totals_column or not was_series:
        try:
            first_index_value = int(list(dataframe.index)[0])
            can_be_int = True
        except Exception:
            can_be_int = False
        if can_be_int:
            # treat the index as years when its first value looks like one
            if 1500 < first_index_value < 2050:
                n = pd.PeriodIndex([d for d in list(dataframe.index)], freq='A')
                dataframe = dataframe.set_index(n)

    # marker/dash combinations cycled through for black and white line plots
    MARKERSIZE = 4
    COLORMAP = {
            0: {'marker': None, 'dash': (None,None)},
            1: {'marker': None, 'dash': [5,5]},
            2: {'marker': "o", 'dash': (None,None)},
            3: {'marker': None, 'dash': [1,3]},
            4: {'marker': "s", 'dash': [5,2,5,2,5,10]},
            5: {'marker': None, 'dash': [5,3,1,2,1,10]},
            6: {'marker': 'o', 'dash': (None,None)},
            7: {'marker': None, 'dash': [5,3,1,3]},
            8: {'marker': "1", 'dash': [1,3]},
            9: {'marker': "*", 'dash': [5,5]},
            10: {'marker': "2", 'dash': [5,2,5,2,5,10]},
            11: {'marker': "s", 'dash': (None,None)}
            }

    if black_and_white:
        if kwargs['kind'] == 'line':
            kwargs['linewidth'] = 1

        cmap = plt.get_cmap('Greys')
        new_cmap = truncate_colormap(cmap, 0.25, 0.95)
        if kwargs['kind'] == 'bar':
            # darker if just one entry
            if len(dataframe.columns) == 1:
                new_cmap = truncate_colormap(cmap, 0.70, 0.90)
        kwargs['colormap'] = new_cmap

    # use styles and plot
    # NOTE(review): how much of what follows sat inside this ``with`` block
    # was reconstructed -- confirm
    with plt.style.context((style)):

        if not sbplt:
            # check if negative values, no stacked if so
            if areamode:
                if dataframe.applymap(lambda x: x < 0.0).any().any():
                    kwargs['stacked'] = False
                    rev_leg = False
            ax = dataframe.plot(figsize = figsize, **kwargs)
        else:
            ax = dataframe.plot(figsize = figsize, **kwargs)
            # one figure-level legend, only when a legend position is known
            if 'loc' in leg_options:
                handles, labels = plt.gca().get_legend_handles_labels()
                plt.legend(handles,
                           labels,
                           loc = leg_options['loc'],
                           bbox_to_anchor = (0,-0.1,1,1),
                           bbox_transform = plt.gcf().transFigure)
            # NOTE(review): ``return`` is placed inside ``if not tk`` by
            # analogy with the show-and-return near the end -- confirm
            if not tk:
                plt.show()
                return

        # with subplots, ``ax`` is a collection of axes, so the single-axes
        # calls below are skipped
        if 'rot' in kwargs and not sbplt:
            if kwargs['rot'] != 0 and kwargs['rot'] != 90:
                labels = [item.get_text() for item in ax.get_xticklabels()]
                ax.set_xticklabels(labels, rotation = kwargs['rot'], ha='right')

        if transparent:
            plt.gcf().patch.set_facecolor('white')
            plt.gcf().patch.set_alpha(0)

        if black_and_white:
            # white background
            plt.gca().set_axis_bgcolor('w')
            if kwargs['kind'] == 'line' and not sbplt:
                # change everything to black and white with interesting dashes and markers
                for line_number, line in enumerate(ax.get_lines()):
                    look = COLORMAP[line_number % len(COLORMAP)]
                    line.set_color('black')
                    line.set_dashes(look['dash'])
                    line.set_marker(look['marker'])
                    line.set_markersize(MARKERSIZE)

        if legend:
            if not piemode and not sbplt:
                if 3 not in interactive_types:
                    if not rev_leg:
                        lgd = plt.legend(**leg_options)
                    else:
                        handles, labels = plt.gca().get_legend_handles_labels()
                        lgd = plt.legend(handles[::-1], labels[::-1], **leg_options)

    if interactive:
        # 1 = highlight lines
        # 2 = line labels
        # 3 = legend switches
        ax = plt.gca()
        lines = ax.lines
        handles, labels = plt.gca().get_legend_handles_labels()
        if 1 in interactive_types:
            plugins.connect(plt.gcf(), HighlightLines(lines))

        if 3 in interactive_types:
            plugins.connect(plt.gcf(), InteractiveLegendPlugin(lines, labels, alpha_unsel=0.0))

        for i, l in enumerate(lines):
            y_vals = l.get_ydata()
            x_vals = l.get_xdata()
            x_vals = [str(x) for x in x_vals]
            if absolutes:
                ls = ['%s (%s: %d)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            else:
                ls = ['%s (%s: %.2f%%)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            if 2 in interactive_types:
                tooltip_line = mpld3.plugins.LineLabelTooltip(lines[i], labels[i])
                mpld3.plugins.connect(plt.gcf(), tooltip_line)
                if kwargs['kind'] == 'line':
                    tooltip_point = mpld3.plugins.PointLabelTooltip(l, labels = ls)
                    mpld3.plugins.connect(plt.gcf(), tooltip_point)

    if piemode:
        if not sbplt:
            plt.axis('equal')
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)

    # add x label
    # this could be revised now!
    # if time series period, it's year for now
    index_is_period = isinstance(dataframe.index, pandas.PeriodIndex)
    if index_is_period:
        x_label = 'Year'

    if x_label is not False:
        if type(x_label) == str:
            plt.xlabel(x_label)
        else:
            # guess from the first index value
            check_x_axis = list(dataframe.index)[0]
            try:
                check_x_axis = int(check_x_axis)
                if 1500 < check_x_axis < 2050:
                    x_label = 'Year'
                else:
                    x_label = 'Group'
            except Exception:
                x_label = 'Group'

        if not sbplt:
            if not piemode:
                plt.xlabel(x_label)

    # no offsets for numerical x and y values
    if not index_is_period:
        try:
            # check if x axis can be an int
            check_x_axis = list(dataframe.index)[0]
            can_it_be_int = int(check_x_axis)
            # if so, set these things
            from matplotlib.ticker import ScalarFormatter
            plt.gca().xaxis.set_major_formatter(ScalarFormatter())
        except Exception:
            pass

    # same for y axis
    try:
        # check if the first column name can be an int
        check_y_axis = list(dataframe.columns)[0]
        can_it_be_int = int(check_y_axis)
        # if so, set these things
        from matplotlib.ticker import ScalarFormatter
        plt.gca().yaxis.set_major_formatter(ScalarFormatter())
    except Exception:
        pass

    # y labelling
    if not absolutes:
        y_l = 'Percentage'
    else:
        y_l = 'Absolute frequency'

    if y_label is not False:
        if not sbplt:
            if not piemode:
                if type(y_label) == str:
                    plt.ylabel(y_label)
                else:
                    plt.ylabel(y_l)

    # hacky: turn legend into subplot titles :)
    if sbplt:
        # get all axes
        if 'layout' not in kwargs:
            axes = [l for index, l in enumerate(ax)]
        else:
            # with a layout, ``ax`` is iterated as rows of axes
            axes = []
            cols = [l for index, l in enumerate(ax)]
            for col in cols:
                for bit in col:
                    axes.append(bit)

        # set subplot titles; axes beyond the last column reuse the
        # previous title
        titletext = ''
        for index, a in enumerate(axes):
            try:
                titletext = list(dataframe.columns)[index]
            except Exception:
                pass
            a.set_title(titletext)
            try:
                a.legend_.remove()
            except Exception:
                pass
            # remove axis labels for pie plots
            if piemode:
                a.axes.get_xaxis().set_visible(False)
                a.axes.get_yaxis().set_visible(False)
                a.axis('equal')

    # add sums to bar graphs
    # NOTE(review): nesting of the annotation code under the 'bar' check was
    # reconstructed -- confirm
    if not sbplt:
        if 'kind' in kwargs:
            if kwargs['kind'].startswith('bar'):
                wants_plot_totals = show_totals.endswith('plot') or show_totals.endswith('both')
                if was_series:
                    the_y_limit = plt.ylim()[1]
                    if wants_plot_totals:
                        # make plot a bit higher if putting these totals on it
                        plt.ylim([0,the_y_limit * 1.05])
                        for i, label in enumerate(list(dataframe.index)):
                            if len(dataframe.loc[label]) == 1:
                                score = dataframe.loc[label][0]
                            else:
                                if absolutes:
                                    score = dataframe.loc[label].sum()
                                else:
                                    # a total percentage can't be determined
                                    # from individual percentages
                                    continue
                            if not absolutes:
                                plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom')
                            else:
                                plt.annotate(score, (i, score), ha = 'center', va = 'bottom')
                else:
                    if wants_plot_totals:
                        for i, label in enumerate(list(dataframe.columns)):
                            if len(dataframe[label]) == 1:
                                score = dataframe[label][0]
                            else:
                                if absolutes:
                                    score = dataframe[label].sum()
                                else:
                                    continue
                            if not absolutes:
                                plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom')
                            else:
                                plt.annotate(score, (i, score), ha = 'center', va = 'bottom')

    plt.subplots_adjust(left=0.1)
    plt.subplots_adjust(bottom=0.18)

    if save:
        if running_python_tex:
            imagefolder = '../images'
        else:
            imagefolder = 'images'

        savename = get_savename(imagefolder, save = save, title = title, ext = output_format)

        if not os.path.isdir(imagefolder):
            os.makedirs(imagefolder)

        # save image and get on with our lives; an outside legend is passed
        # as an extra artist, but only if one was actually drawn
        if outside_legend and lgd is not None:
            plt.gcf().savefig(savename, dpi=150, bbox_extra_artists=(lgd,), bbox_inches='tight', format = output_format)
        else:
            plt.gcf().savefig(savename, dpi=150, format = output_format)
        timestamp = strftime("%H:%M:%S", localtime())
        if os.path.isfile(savename):
            print('\n' + timestamp + ": " + savename + " created.")
        else:
            raise ValueError("Error making %s." % savename)

    if not interactive and not running_python_tex and not running_spider and not tk:
        plt.show()
        return
    if running_spider or tk or sbplt:
        return plt

    if interactive:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)
        try:
            ax.legend_.remove()
        except Exception:
            pass
        return mpld3.display()
def plotter(title,
            df,
            kind = 'line',
            x_label = None,
            y_label = None,
            style = 'ggplot',
            figsize = (8, 4),
            save = False,
            legend_pos = 'best',
            reverse_legend = 'guess',
            num_to_plot = 7,
            tex = 'try',
            colours = 'Accent',
            cumulative = False,
            pie_legend = True,
            partial_pie = False,
            show_totals = False,
            transparent = False,
            output_format = 'png',
            interactive = False,
            black_and_white = False,
            show_p_val = False,
            indices = False,
            **kwargs):
    """Visualise corpus interrogations.

    :param title: A title for the plot
    :type title: str
    :param df: Data to be plotted
    :type df: pandas.core.frame.DataFrame
    :param x_label: A label for the x axis
    :type x_label: str
    :param y_label: A label for the y axis
    :type y_label: str
    :param kind: The kind of chart to make
    :type kind: str ('line'/'bar'/'barh'/'pie'/'area')
    :param style: Visual theme of plot
    :type style: str ('ggplot'/'bmh'/'fivethirtyeight'/'seaborn-talk'/etc)
    :param figsize: Size of plot
    :type figsize: tuple (int, int)
    :param save: If bool, save with *title* as name; if str, use str as name
    :type save: bool/str
    :param legend_pos: Where to place legend
    :type legend_pos: str ('upper right'/'outside right'/etc)
    :param reverse_legend: Reverse the order of the legend
    :type reverse_legend: bool
    :param num_to_plot: How many columns to plot
    :type num_to_plot: int/'all'
    :param tex: Use TeX to draw plot text
    :type tex: bool
    :param colours: Colourmap for lines/bars/slices
    :type colours: str
    :param cumulative: Plot values cumulatively
    :type cumulative: bool
    :param pie_legend: Show a legend for pie chart
    :type pie_legend: bool
    :param partial_pie: Allow plotting of pie slices only
    :type partial_pie: bool
    :param show_totals: Print sums in plot where possible
    :type show_totals: str -- 'legend'/'plot'/'both'
    :param transparent: Transparent .png background
    :type transparent: bool
    :param output_format: File format for saved image
    :type output_format: str -- 'png'/'pdf'
    :param black_and_white: Create black and white line styles
    :type black_and_white: bool
    :param show_p_val: Attempt to print p values in legend if contained in df
    :type show_p_val: bool
    :param indices: To use when plotting "distance from root"
    :type indices: bool
    :param stacked: When making bar chart, stack bars on top of one another
    :type stacked: str
    :param filled: For area and bar charts, make every column sum to 100
    :type filled: str
    :param legend: Show a legend
    :type legend: bool
    :param rot: Rotate x axis ticks by *rot* degrees
    :type rot: int
    :param subplots: Plot each column separately
    :type subplots: bool
    :param layout: Grid shape to use when *subplots* is True
    :type layout: tuple -- (int, int)
    :param interactive: Experimental interactive options
    :type interactive: list -- [1, 2, 3]
    :returns: ``None`` after showing the figure in a plain session (and
              when interactive subplots are requested, which are not
              supported); the ``matplotlib.pyplot`` module when running
              under Spyder or Tkinter; the result of ``mpld3.display()``
              when *interactive* is set
    :raises ValueError: if *output_format* is not recognised, or the saved
                        file is missing after saving
    :raises KeyError: if *legend_pos* is not a known position
    """
    import corpkit
    import os
    # imported unconditionally: the function warns in several places, and
    # must not depend on the optional IPython import below succeeding
    import warnings

    try:
        from IPython.utils.shimmodule import ShimWarning
        warnings.simplefilter('ignore', ShimWarning)
    except Exception:
        pass

    import matplotlib as mpl
    from matplotlib import rc

    # prefer seaborn plotting
    try:
        import seaborn as sns
    except Exception:
        sns = None

    if interactive:
        import matplotlib.pyplot as plt, mpld3
    else:
        import matplotlib.pyplot as plt
    import pandas
    from pandas import DataFrame

    import numpy
    from time import localtime, strftime
    from corpkit.tests import check_pytex, check_spider, check_t_kinter

    if interactive:
        import mpld3
        import collections
        from mpld3 import plugins, utils
        # NOTE(review): presumably a project-local ``plugins`` module found
        # via implicit relative import -- confirm before porting to Python 3
        from plugins import InteractiveLegendPlugin, HighlightLines

    # str on Python 3; str and unicode on Python 2
    string_types = (str, type(u''))

    # check what environment we're in
    tk = check_t_kinter()
    running_python_tex = check_pytex()
    running_spider = check_spider()

    def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
        """remove extreme values from colourmap --- no pure white"""
        import matplotlib.colors as colors
        import numpy as np
        new_cmap = colors.LinearSegmentedColormap.from_list(
            'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval),
            cmap(np.linspace(minval, maxval, n)))
        return new_cmap

    def get_savename(imagefolder, save = False, title = False, ext = 'png'):
        """Come up with the savename for the image."""
        import re

        def urlify(s):
            "Turn title into filename"
            s = s.lower()
            s = re.sub(r"[^\w\s-]", '', s)
            s = re.sub(r"\s+", '-', s)
            s = re.sub(r"-(textbf|emph|textsc|textit)", '-', s)
            return s

        if not ext.startswith('.'):
            ext = '.' + ext
        # an explicit string in *save* wins; otherwise name after the title
        if isinstance(save, string_types):
            basename = save
        elif title:
            basename = title
        else:
            raise ValueError('Need a title or a save name to save an image.')
        savename = os.path.join(imagefolder, urlify(basename) + ext)
        # remove duplicated ext
        doubled = '%s%s' % (ext, ext)
        if savename.endswith(doubled):
            savename = savename.replace(doubled, ext, 1)
        return savename

    def rename_data_with_total(dataframe, was_series = False, using_tex = False, absolutes = True):
        """adds totals (abs, rel, keyness) to entry name strings"""
        if was_series:
            where_the_words_are = dataframe.index
        else:
            where_the_words_are = dataframe.columns
        the_labs = []
        for w in list(where_the_words_are):
            if not absolutes:
                if not was_series:
                    # no meaningful total for relative multi-row data
                    the_labs.append(w)
                    continue
                perc = dataframe.T[w].iloc[0]
                if using_tex:
                    the_labs.append(r'%s (%.2f\%%)' % (w, perc))
                else:
                    the_labs.append('%s (%.2f %%)' % (w, perc))
            else:
                if was_series:
                    score = dataframe.T[w].sum()
                else:
                    score = dataframe[w].sum()
                the_labs.append('%s (n=%d)' % (w, score))
        if not was_series:
            dataframe.columns = the_labs
        else:
            vals = list(dataframe[list(dataframe.columns)[0]].values)
            dataframe = pandas.DataFrame(vals, index = the_labs)
            dataframe.columns = ['Total']
        return dataframe

    def auto_explode(dataframe, entries, was_series = False, num_to_plot = 7):
        """give me a list of strings and i'll output explode option"""
        output = [0 for s in range(num_to_plot)]
        if was_series:
            names = list(dataframe.index)
        else:
            names = list(dataframe.columns)
        if isinstance(entries, string_types) or isinstance(entries, int):
            entries = [entries]
        if isinstance(entries, list):
            for entry in entries:
                if isinstance(entry, string_types):
                    position = names.index(entry)
                else:
                    position = entry
                output[position] = 0.1
        return output

    def filler(df):
        """scale every row of df so that it sums to 100"""
        pby = df.T.copy()
        for i in list(pby.columns):
            tot = pby[i].sum()
            pby[i] = pby[i] * 100.0 / tot
        return pby.T

    def is_number(s):
        """check if str can be can be made into float/int"""
        try:
            float(s) # for int, long and float
        except ValueError:
            try:
                complex(s) # for complex
            except ValueError:
                return False
        return True

    def p_string_formatter(val):
        """format a p value for display in an entry name"""
        if val < 0.001:
            if not using_tex:
                return 'p < 0.001'
            else:
                return r'p $<$ 0.001'
        else:
            return 'p = %s' % format(val, '.3f')

    # check if we're doing subplots
    sbplt = kwargs.get('subplots') is True
    kwargs['subplots'] = sbplt

    if colours is True:
        colours = 'Paired'

    if style == 'mpl-white':
        if sns is not None:
            try:
                sns.set_style("whitegrid")
            except Exception:
                pass
        style = 'matplotlib'

    if style and style.startswith('seaborn'):
        colours = False

    # use 'draggable = True' to make a draggable legend
    dragmode = kwargs.pop('draggable', False)

    # always remove savepath, so a falsy value never reaches DataFrame.plot
    savepath = kwargs.pop('savepath', None)
    if savepath:
        mpl.rcParams['savefig.directory'] = savepath

    mpl.rcParams['savefig.bbox'] = 'tight'
    mpl.rcParams.update({'figure.autolayout': True})

    # try to use tex
    # TO DO:
    # make some font kwargs here
    using_tex = False
    mpl.rcParams['font.family'] = 'sans-serif'
    try:
        mpl.rcParams['text.latex.unicode'] = True
    except KeyError:
        # newer matplotlib no longer has this rcParam
        pass

    if tex == 'try' or tex is True:
        try:
            rc('text', usetex=True)
            rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
            using_tex = True
        except Exception:
            rc('font', family='sans-serif')
            rc('font', serif='Helvetica Neue')
            rc('text', usetex=False)
    else:
        rc('text', usetex=False)

    if interactive:
        using_tex = False

    if show_totals is False:
        show_totals = 'none'

    # find out what kind of plot we're making, and enable
    # or disable interactive values if need be
    kind = kind.lower()
    kwargs['kind'] = kind

    interactive_types = [None]
    if interactive:
        if kind.startswith('bar'):
            interactive_types = [3]
        elif kind in ['area', 'line']:
            interactive_types = [2, 3]
        elif kind == 'pie':
            # an empty list, so that later membership tests still work
            interactive_types = []
            warnings.warn('Interactive plotting not yet available for pie plots.')

    # find out if pie mode, add autopct format
    piemode = kind == 'pie'
    if piemode:
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            kwargs['pctdistance'] = 0.6
            if using_tex:
                kwargs['autopct'] = r'%1.1f\%%'
            else:
                kwargs['autopct'] = '%1.1f%%'

    # copy data, make series into df
    dataframe = df.copy()
    was_series = False
    if isinstance(dataframe, pandas.Series):
        was_series = True
        if not cumulative:
            dataframe = DataFrame(dataframe)
        else:
            dataframe = DataFrame(dataframe.cumsum())
    else:
        # don't know if this is much good.
        if cumulative:
            dataframe = DataFrame(dataframe.cumsum())
        if len(list(dataframe.columns)) == 1:
            was_series = True

    # attempt to convert x axis to ints:
    try:
        dataframe.index = [int(i) for i in list(dataframe.index)]
    except Exception:
        pass

    # remove totals and tkinter order
    if not was_series and not all(str(x).lower() == 'total' for x in list(dataframe.columns)):
        for name, which_axis in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
            try:
                dataframe = dataframe.drop(name, axis = which_axis, errors = 'ignore')
            except Exception:
                pass
    else:
        dataframe = dataframe.drop('tkintertable-order', errors = 'ignore')
        dataframe = dataframe.drop('tkintertable-order', axis = 1, errors = 'ignore')

    # look at columns to see if all can be ints, in which case, set up figure
    # for depnumming
    if not was_series:
        if indices == 'guess':
            def isint(x):
                try:
                    a = float(x)
                    b = int(a)
                except (ValueError, OverflowError, TypeError):
                    return False
                else:
                    return a == b

            indices = all(isint(x) is True for x in list(dataframe.columns))

        # if depnumming, plot all, transpose, and rename axes
        if indices is True:
            num_to_plot = 'all'
            dataframe = dataframe.T
            if y_label is None:
                y_label = 'Percentage of all matches'
            if x_label is None:
                x_label = ''

    # set backend?
    output_formats = ['svgz', 'ps', 'emf', 'rgba', 'raw', 'pdf', 'svg', 'eps', 'png', 'pgf']
    if output_format not in output_formats:
        raise ValueError('%s output format not recognised. Must be: %s' % (output_format, ', '.join(output_formats)))

    # don't know if these are necessary
    if 'pdf' in output_format:
        plt.switch_backend(output_format)
    if 'pgf' in output_format:
        plt.switch_backend(output_format)

    if num_to_plot == 'all':
        if was_series or piemode:
            num_to_plot = len(dataframe.index)
        else:
            num_to_plot = len(list(dataframe.columns))

    # explode pie, or remove if not piemode
    if piemode and not sbplt and kwargs.get('explode'):
        kwargs['explode'] = auto_explode(dataframe,
                                         kwargs['explode'],
                                         was_series = was_series,
                                         num_to_plot = num_to_plot)
    else:
        kwargs.pop('explode', None)

    legend = kwargs.get('legend', False)

    # cut data short
    plotting_a_totals_column = False
    if was_series:
        if list(dataframe.columns)[0] != 'Total':
            try:
                # numeric index: keep every row
                [int(x) for x in list(dataframe.index)]
                num_to_plot = len(dataframe)
            except Exception:
                dataframe = dataframe[:num_to_plot]
        else:
            plotting_a_totals_column = True
            if 'legend' not in kwargs:
                legend = False
            num_to_plot = len(dataframe)
    else:
        dataframe = dataframe.T.head(num_to_plot).T

    # remove stats fields, put p in entry text, etc.
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        dataframe = dataframe.drop(statfields, axis = 1, errors = 'ignore')
    except Exception:
        pass
    try:
        dataframe.loc['p']
        there_are_p_vals = True
    except Exception:
        there_are_p_vals = False
    if show_p_val:
        if there_are_p_vals:
            newnames = []
            for col in list(dataframe.columns):
                pstr = p_string_formatter(dataframe[col]['p'])
                newnames.append('%s (%s)' % (col, pstr))
            dataframe.columns = newnames
            dataframe.drop(statfields, axis = 0, inplace = True, errors = 'ignore')
        else:
            warnings.warn('No p-values calculated to show.\n\nUse sort_by and keep_stats in editor() to generate these values.')
    else:
        if there_are_p_vals:
            dataframe.drop(statfields, axis = 0, inplace = True, errors = 'ignore')

    # make and set y label: data counts as absolute frequencies unless the
    # first row holds floats that are not whole numbers
    absolutes = True
    try:
        if not all([s.is_integer() for s in dataframe.iloc[0, :].values]):
            absolutes = False
    except Exception:
        pass

    # use colormap if need be:
    if colours == 'Default':
        colours = 'Paired'
    if colours and num_to_plot > 0 and (piemode or not was_series):
        kwargs['colormap'] = colours

    # multicoloured bar charts
    if colours:
        if kind.startswith('bar'):
            if len(list(dataframe.columns)) == 1:
                if not black_and_white:
                    import numpy as np
                    the_range = np.linspace(0, 1, num_to_plot)
                    cmap = plt.get_cmap(colours)
                    kwargs['colors'] = [cmap(n) for n in the_range]

    # reversing legend option: explicit True wins, 'guess' reverses only
    # for the kinds that stack from the bottom
    rev_leg = reverse_legend is True
    if reverse_legend == 'guess' and kind in ['barh', 'area']:
        rev_leg = True

    # show legend or don't
    if kind in ['bar', 'barh', 'area', 'line', 'pie']:
        if was_series:
            legend = False
        if kind == 'pie':
            legend = bool(pie_legend)

    # the default legend placement
    if legend_pos is True:
        legend_pos = 'best'

    # cut dataframe if just_totals
    if 'Combined total' in dataframe.columns:
        dataframe = dataframe.head(num_to_plot)

    # rotate automatically
    if 'rot' not in kwargs:
        if not was_series:
            xvals = [str(i) for i in list(dataframe.index)[:num_to_plot]]
        else:
            xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        if xvals and len(max(xvals, key=len)) > 6:
            if not piemode:
                kwargs['rot'] = 45

    # no title for subplots because ugly,
    if title and not sbplt:
        kwargs['title'] = title

    # no interactive subplots yet: warn and give up without plotting
    if sbplt and interactive:
        interactive = False
        warnings.warn('No interactive subplots yet, sorry.')
        return

    # legends whose position name starts with 'o' are drawn outside the axes
    legend_is_outside = (isinstance(legend_pos, string_types)
                         and legend_pos.startswith('o'))

    if legend and num_to_plot > 6 and not kwargs.get('ncol'):
        kwargs['ncol'] = num_to_plot // 7

    # kwarg options go in leg_options; built even without a legend, because
    # the pie code below reads and writes it
    leg_options = {'framealpha': .8,
                   'shadow': kwargs.get('shadow', False),
                   'ncol': kwargs.pop('ncol', 1)}

    if legend:
        # determine legend position based on this dict
        if legend_pos:
            possible = {'best': 0,
                        'upper right': 1,
                        'upper left': 2,
                        'lower left': 3,
                        'lower right': 4,
                        'right': 5,
                        'center left': 6,
                        'center right': 7,
                        'lower center': 8,
                        'upper center': 9,
                        'center': 10,
                        'o r': 2,
                        'outside right': 2,
                        'outside upper right': 2,
                        'outside center right': 'center left',
                        'outside lower right': 'lower left'}
            if isinstance(legend_pos, int):
                the_loc = legend_pos
            else:
                try:
                    the_loc = possible[legend_pos]
                except (KeyError, TypeError):
                    raise KeyError('legend_pos value must be one of:\n%s\n or an int between 0-10.' %', '.join(possible.keys()))
            leg_options['loc'] = the_loc
            # weirdness needed for outside plot: anchor the corner named by
            # 'loc' just past the right-hand edge of the axes
            if legend_pos in ['o r', 'outside right', 'outside upper right']:
                leg_options['bbox_to_anchor'] = (1.02, 1)
            if legend_pos == 'outside center right':
                leg_options['bbox_to_anchor'] = (1.02, 0.5)
            if legend_pos == 'outside lower right':
                leg_options['bbox_to_anchor'] = (1.02, 0)

        # a bit of distance between legend and plot for outside legends
        if legend_is_outside:
            leg_options['borderaxespad'] = 1

    if show_totals.endswith('both') or show_totals.endswith('legend'):
        if not piemode or pie_legend:
            dataframe = rename_data_with_total(dataframe,
                                               was_series = was_series,
                                               using_tex = using_tex,
                                               absolutes = absolutes)

    if piemode:
        if partial_pie:
            dataframe = dataframe / 100.0

        # some pie things
        if not sbplt:
            kwargs['y'] = list(dataframe.columns)[0]
        if pie_legend:
            kwargs['legend'] = False
            if was_series or sbplt:
                leg_options['labels'] = list(dataframe.index)
            else:
                leg_options['labels'] = list(dataframe.columns)

    areamode = kind == 'area'

    if legend is False:
        kwargs['legend'] = False

    # line highlighting option for interactive!
    if interactive:
        if 2 in interactive_types:
            if kind == 'line':
                kwargs['marker'] = ','
        if not piemode:
            kwargs['alpha'] = 0.1

    # convert dates --- works only in my current case!
    if plotting_a_totals_column or not was_series:
        try:
            first_index = int(list(dataframe.index)[0])
        except Exception:
            first_index = None
        if first_index is not None and 1500 < first_index < 2050:
            n = pandas.PeriodIndex([d for d in list(dataframe.index)], freq='A')
            dataframe = dataframe.set_index(n)
    index_is_period = isinstance(dataframe.index, pandas.PeriodIndex)

    # always remove 'filled', so a falsy value never reaches DataFrame.plot
    if kwargs.pop('filled', False):
        if areamode or kind.startswith('bar'):
            dataframe = filler(dataframe)

    # marker and dash patterns cycled through for black and white line plots
    MARKERSIZE = 4
    COLORMAP = {
            0: {'marker': None, 'dash': (None,None)},
            1: {'marker': None, 'dash': [5,5]},
            2: {'marker': "o", 'dash': (None,None)},
            3: {'marker': None, 'dash': [1,3]},
            4: {'marker': "s", 'dash': [5,2,5,2,5,10]},
            5: {'marker': None, 'dash': [5,3,1,2,1,10]},
            6: {'marker': 'o', 'dash': (None,None)},
            7: {'marker': None, 'dash': [5,3,1,3]},
            8: {'marker': "1", 'dash': [1,3]},
            9: {'marker': "*", 'dash': [5,5]},
            10: {'marker': "2", 'dash': [5,2,5,2,5,10]},
            11: {'marker': "s", 'dash': (None,None)}
            }

    if black_and_white:
        if kind == 'line':
            kwargs['linewidth'] = 1

        cmap = plt.get_cmap('Greys')
        new_cmap = truncate_colormap(cmap, 0.25, 0.95)
        if kind == 'bar':
            # darker if just one entry
            if len(dataframe.columns) == 1:
                new_cmap = truncate_colormap(cmap, 0.70, 0.90)
        kwargs['colormap'] = new_cmap

    class dummy_context_mgr():
        """a fake context for plotting without style
        perhaps made obsolete by 'classic' style in new mpl"""
        def __enter__(self):
            return None
        def __exit__(self, one, two, three):
            return False

    # handle of the legend drawn below, needed when saving outside legends
    lgd = None

    # a falsy style means "no style", just like 'matplotlib'
    if style and style != 'matplotlib':
        style_context = plt.style.context((style))
    else:
        style_context = dummy_context_mgr()

    with style_context:

        if not sbplt:
            # check if negative values, no stacked if so
            if areamode:
                kwargs['legend'] = False
                if dataframe.applymap(lambda x: x < 0.0).any().any():
                    kwargs['stacked'] = False
                    rev_leg = False
            ax = dataframe.plot(figsize = figsize, **kwargs)
        else:
            plt.gcf().set_tight_layout(False)
            ax = dataframe.plot(figsize = figsize, **kwargs)
            if piemode:
                handles, labels = plt.gca().get_legend_handles_labels()
                plt.legend(handles, labels, loc = leg_options.get('loc', 0),
                           bbox_to_anchor = (0,-0.1,1,1),
                           bbox_transform = plt.gcf().transFigure)

            # this line allows layouts with missing plots
            # i.e. layout = (5, 2) with only nine plots
            plt.gcf().set_tight_layout(False)

        # right-align rotated tick labels; with subplots, ax is an array of
        # axes and pandas has already applied the rotation
        if 'rot' in kwargs and not sbplt:
            if kwargs['rot'] != 0 and kwargs['rot'] != 90:
                labels = [item.get_text() for item in ax.get_xticklabels()]
                ax.set_xticklabels(labels, rotation = kwargs['rot'], ha='right')

        if transparent:
            plt.gcf().patch.set_facecolor('white')
            plt.gcf().patch.set_alpha(0)

        if black_and_white:
            if kind == 'line':
                # white background
                # change everything to black and white with interesting dashes and markers
                c = 0
                for line in ax.get_lines():
                    line.set_color('black')
                    line.set_dashes(COLORMAP[c]['dash'])
                    line.set_marker(COLORMAP[c]['marker'])
                    line.set_markersize(MARKERSIZE)
                    c += 1
                    if c == len(COLORMAP.keys()):
                        c = 0

        # draw legend with proper placement etc
        if legend:
            if not piemode and not sbplt:
                if 3 not in interactive_types:
                    handles, labels = plt.gca().get_legend_handles_labels()
                    # area doubles the handles and labels. this removes half:
                    if areamode:
                        handles = handles[-len(handles) // 2:]
                        labels = labels[-len(labels) // 2:]
                    if rev_leg:
                        handles = handles[::-1]
                        labels = labels[::-1]
                    lgd = plt.legend(handles, labels, **leg_options)

    if interactive:
        # 1 = highlight lines
        # 2 = line labels
        # 3 = legend switches
        ax = plt.gca()
        # fails for piemode
        lines = ax.lines
        handles, labels = plt.gca().get_legend_handles_labels()
        if 1 in interactive_types:
            plugins.connect(plt.gcf(), HighlightLines(lines))

        if 3 in interactive_types:
            plugins.connect(plt.gcf(), InteractiveLegendPlugin(lines, labels, alpha_unsel=0.0))

        for i, l in enumerate(lines):
            y_vals = l.get_ydata()
            x_vals = l.get_xdata()
            x_vals = [str(x) for x in x_vals]
            if absolutes:
                ls = ['%s (%s: %d)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            else:
                ls = ['%s (%s: %.2f%%)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            if 2 in interactive_types:
                tooltip_line = mpld3.plugins.LineLabelTooltip(lines[i], labels[i])
                mpld3.plugins.connect(plt.gcf(), tooltip_line)
                if kind == 'line':
                    tooltip_point = mpld3.plugins.PointLabelTooltip(l, labels = ls)
                    mpld3.plugins.connect(plt.gcf(), tooltip_point)

    if piemode:
        if not sbplt:
            plt.axis('equal')
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)

    # add x label
    # this could be revised now!
    # if time series period, it's year for now
    if index_is_period:
        x_label = 'Year'

    if x_label is not False:
        if isinstance(x_label, string_types):
            plt.xlabel(x_label)
        else:
            # guess from the first index entry whether this is a time series
            try:
                check_x_axis = int(list(dataframe.index)[0])
                if 1500 < check_x_axis < 2050:
                    x_label = 'Year'
                else:
                    x_label = 'Group'
            except Exception:
                x_label = 'Group'

        if not sbplt:
            if not piemode:
                plt.xlabel(x_label)

    # for now, always turn off sci notation
    from matplotlib.ticker import ScalarFormatter
    if not index_is_period:
        try:
            if all(is_number(s) for s in list(dataframe.index)):
                plt.gca().xaxis.set_major_formatter(ScalarFormatter())
        except Exception:
            pass
    try:
        if all(is_number(s) for s in list(dataframe.columns)):
            plt.gca().yaxis.set_major_formatter(ScalarFormatter())
    except Exception:
        pass

    # y labelling
    if not absolutes:
        y_l = 'Percentage'
    else:
        y_l = 'Absolute frequency'

    if y_label is not False:
        if isinstance(y_label, string_types):
            the_y = y_label
        else:
            the_y = y_l
        if not sbplt:
            if not piemode:
                plt.ylabel(the_y)
        else:
            # one shared label at the left edge of the figure
            plt.gcf().text(0.04, 0.5, the_y, va='center', rotation='vertical')

    # read the grid option once, so that every axis gets the same value
    show_grid = kwargs.pop('grid', False)

    # hacky: turn legend into subplot titles :)
    if sbplt:
        # get all axes; with 'layout', pandas hands back a 2d array of them
        axes = list(numpy.ravel(ax))
        column_names = list(dataframe.columns)

        # set subplot titles
        for index, a in enumerate(axes):
            # layouts may contain more axes than there are columns
            if index < len(column_names):
                a.set_title(column_names[index])
            try:
                a.legend_.remove()
            except Exception:
                pass
            # remove axis labels for pie plots
            if piemode:
                a.axes.get_xaxis().set_visible(False)
                a.axes.get_yaxis().set_visible(False)
                a.axis('equal')

            # show grid
            a.grid(show_grid)

    # add sums to bar graphs and pie graphs
    # doubled right now, no matter
    if not sbplt:
        if kind.startswith('bar'):
            # show grid
            ax.grid(show_grid)

    if was_series:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            # make plot a bit higher if putting these totals on it
            plt.ylim([0, the_y_limit * 1.05])
            for i, label in enumerate(list(dataframe.index)):
                row = dataframe.loc[label]
                if len(row) == 1:
                    score = row.iloc[0]
                elif absolutes:
                    score = row.sum()
                else:
                    # no total percentage from individual percentages
                    continue
                if not absolutes:
                    plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom')
                else:
                    plt.annotate(score, (i, score), ha = 'center', va = 'bottom')
    else:
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            for i, label in enumerate(list(dataframe.columns)):
                column = dataframe[label]
                if len(column) == 1:
                    score = column.iloc[0]
                elif absolutes:
                    score = column.sum()
                else:
                    # no total percentage from individual percentages
                    continue
                if not absolutes:
                    plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom')
                else:
                    plt.annotate(score, (i, score), ha = 'center', va = 'bottom')

    plt.subplots_adjust(left=0.1)
    plt.subplots_adjust(bottom=0.18)

    if 'layout' not in kwargs:
        if not sbplt:
            plt.tight_layout()

    if save:
        if running_python_tex:
            imagefolder = '../images'
        else:
            imagefolder = 'images'

        savename = get_savename(imagefolder, save = save, title = title, ext = output_format)

        if not os.path.isdir(imagefolder):
            os.makedirs(imagefolder)

        # save image and get on with our lives; an outside legend has to be
        # passed explicitly or it gets cropped out of the image
        if legend_is_outside and lgd is not None:
            plt.gcf().savefig(savename, dpi=150, bbox_extra_artists=(lgd,),
                              bbox_inches='tight', format = output_format)
        else:
            plt.gcf().savefig(savename, dpi=150, format = output_format)
        timestamp = strftime("%H:%M:%S", localtime())
        if os.path.isfile(savename):
            print('\n' + timestamp + ": " + savename + " created.")
        else:
            raise ValueError("Error making %s." % savename)

    if dragmode:
        plt.legend().draggable()

    if sbplt:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)

    if not interactive and not running_python_tex and not running_spider \
        and not tk:
        plt.gcf().show()
        return
    elif running_spider or tk:
        return plt

    if interactive:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)
        try:
            ax.legend_.remove()
        except Exception:
            pass
        return mpld3.display()