예제 #1
0
파일: editor.py 프로젝트: muranava/corpkit
def editor(dataframe1, 
            operation = '%',
            dataframe2 = False,
            sort_by = False,
            keep_stats = True,
            keep_top = False,
            just_totals = False,
            threshold = 'medium',
            just_entries = False,
            skip_entries = False,
            merge_entries = False,
            newname = 'combine',
            just_subcorpora = False,
            skip_subcorpora = False,
            span_subcorpora = False,
            merge_subcorpora = False,
            new_subcorpus_name = False,
            replace_names = False,
            projection = False,
            remove_above_p = False,
            p = 0.05, 
            revert_year = True,
            print_info = True,
            convert_spelling = False,
            selfdrop = True,
            calc_all = True,
            **kwargs
            ):
    """Edit results of corpus interrogation.

    replace_names: give a tuple with (to_find, replacement)
                      give a raw string to delete"""

    import pandas
    import pandas as pd
    import numpy as np
    import re
    import collections
    from pandas import DataFrame, Series
    from time import localtime, strftime
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        have_ipython = False
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass

    pd.options.mode.chained_assignment = None

    the_time_started = strftime("%Y-%m-%d %H:%M:%S")
    
    pd.set_option('display.float_format', lambda x: '%.2f' % x)

    from corpkit.tests import check_pytex
    if check_pytex():
        print_info = False

    def combiney(df, df2, operation = '%', threshold = 'medium', prinf = True):
        """mash df and df2 together in appropriate way"""
        # delete under threshold
        if just_totals:
            if using_totals:
                if not single_totals:
                    to_drop = list(df2[df2['Combined total'] < threshold].index)
                    df = df.drop([e for e in to_drop if e in list(df.index)])
                    if prinf:
                        to_show = []
                        [to_show.append(w) for w in to_drop[:5]]
                        if len(to_drop) > 10:
                            to_show.append('...')
                            [to_show.append(w) for w in to_drop[-5:]]
                        if len(to_drop) > 0:
                            print 'Removing %d entries below threshold:\n    %s' % (len(to_drop), '\n    '.join(to_show))
                        if len(to_drop) > 10:
                            print '... and %d more ... \n' % (len(to_drop) - len(to_show) + 1)
                        else:
                            print ''
                else:
                    denom = df2
        else:
            denom = list(df2)
        if single_totals:
            if operation == '%':
                df = df * 100.0
                df = df.div(denom, axis = 0)
            elif operation == '+':
                df = df.add(denom, axis = 0)
            elif operation == '-':
                df = df.sub(denom, axis = 0)
            elif operation == '*':
                df = df.mul(denom, axis = 0)
            elif operation == '/':
                df = df.div(denom, axis = 0)
            return df

        elif not single_totals:
            #p = ProgressBar(len(list(df.columns)))
            for index, entry in enumerate(list(df.columns)):
                #p.animate(index)
                if operation == '%':
                    try:
                        df[entry] = df[entry] * 100.0 / df2[entry]
                    except:
                        continue
                    #df.drop(entry, axis = 1, inplace = True)
                    #df[entry] = maths_done
                elif operation == '+':
                    try:
                        df[entry] = df[entry] + df2[entry]
                    except:
                        continue
                elif operation == '-':
                    try:
                        df[entry] = df[entry] - df2[entry]
                    except:
                        continue
                elif operation == '*':
                    try:
                        df[entry] = df[entry] * df2[entry]
                    except:
                        continue
                elif operation == '/':
                    try:
                        df[entry] = df[entry] / df2[entry]
                    except:
                        continue
            #p.animate(len(list(df.columns)))
            #if have_ipython:
                #clear_output()
        return df

    def parse_input(df, input):
        """turn whatever has been passed in into list of words that can be used as pandas indices"""
        import re
        
        if input == 'all':
            input = r'.*'

        if type(input) == int:
            input = [input]

        elif type(input) == str:
            try:
                regex = re.compile(input)
                parsed_input = [w for w in list(df) if re.search(regex, w)]
                return parsed_input
            except:
                input = [input]

        if type(input) == list:
            if type(input[0]) == int:
                parsed_input = [word for index, word in enumerate(list(df)) if index in input]
            elif type(input[0]) == str or type(input[0]) == unicode:
                parsed_input = [word for word in input if word in df.columns]

        return parsed_input


    def synonymise(df, pos = 'n'):
        """pass a df and a pos and convert df columns to most common synonyms"""
        import nltk
        from nltk.corpus import wordnet as wn
        #from dictionaries.taxonomies import taxonomies
        from collections import Counter
        fixed = []
        for w in list(df.columns):
            try:
                syns = []
                for syns in wn.synsets(w, pos = pos):
                    for w in syns:
                        synonyms.append(w)
                top_syn = Counter(syns).most_common(1)[0][0]
                fixed.append(top_syn)
            except:
                fixed.append(w)
        df.columns = fixed
        return df

    def convert_spell(df, convert_to = 'US', print_info = print_info):
        """turn dataframes into us/uk spelling"""
        from dictionaries.word_transforms import usa_convert
        if print_info:
            print 'Converting spelling ... \n'
        if convert_to == 'UK':
            usa_convert = {v: k for k, v in usa_convert.items()}
        fixed = []
        for val in list(df.columns):
            try:
                fixed.append(usa_convert[val])
            except:
                fixed.append(val)
        df.columns = fixed
        return df

    def merge_duplicates(df, print_info = print_info):
        if print_info:
            print 'Merging duplicate entries ... \n'
        # now we have to merge all duplicates
        for dup in df.columns.get_duplicates():
            #num_dupes = len(list(df[dup].columns))
            temp = df[dup].sum(axis = 1)
            #df = df.drop([dup for d in range(num_dupes)], axis = 1)
            df = df.drop(dup, axis = 1)
            df[dup] = temp
        return df

    def name_replacer(df, replace_names, print_info = print_info):
        import re        
        # double or single nest if need be
        if type(replace_names) == str:
            replace_names = [(replace_names, '')]
        if type(replace_names[0]) == str:
            replace_names = [replace_names]
        """replace entry names and merge"""
        for to_find, replacement in replace_names:
            if print_info:
                try:
                    print 'Replacing "%s" with "%s" ...\n' % (to_find, replacement)
                except:
                    print 'Deleting "%s" from entry names ...\n' % (to_find)
            to_find = re.compile(to_find)
            try:
                replacement = replacement
            except:
                replacement = ''
            df.columns = [re.sub(to_find, replacement, l) for l in list(df.columns)]
        df = merge_duplicates(df, print_info = False)
        return df

    def just_these_entries(df, parsed_input, prinf = True):
        entries = [word for word in list(df) if word not in parsed_input]
        if prinf:
            print 'Keeping %d entries:\n    %s' % (len(parsed_input), '\n    '.join(parsed_input[:10]))
            if len(parsed_input) > 10:
                print '... and %d more ... \n' % (len(parsed_input) - 10)
            else:
                print ''
        df = df.drop(entries, axis = 1)
        return df

    def skip_these_entries(df, parsed_input, prinf = True):
        if prinf:     
            print 'Skipping %d entries:\n    %s' % (len(parsed_input), '\n    '.join(parsed_input[:10]))
            if len(parsed_input) > 10:
                print '... and %d more ... \n' % (len(parsed_input) - 10)
            else:
                print ''
        df = df.drop(parsed_input, axis = 1)
        return df

    def newname_getter(df, parsed_input, newname = 'combine', prinf = True, merging_subcorpora = False):
        """makes appropriate name for merged entries"""
        if merging_subcorpora:
            if newname is False:
                newname = 'combine'
        if type(newname) == int:
            the_newname = list(df.columns)[newname]
        elif type(newname) == str:
            if newname == 'combine':
                if len(parsed_input) <= 3:
                    the_newname = '/'.join(parsed_input)
                elif len(parsed_input) > 3:
                    the_newname = '/'.join(parsed_input[:3]) + '...'
            else:
                the_newname = newname
        if newname is False:
            # revise this code
            import operator
            sumdict = {}
            for item in parsed_input:
                summed = sum(list(df[item]))
                sumdict[item] = summed
            the_newname = max(sumdict.iteritems(), key=operator.itemgetter(1))[0]
        if type(the_newname) != unicode:
            the_newname = unicode(the_newname, errors = 'ignore')
        return the_newname

    def merge_these_entries(df, parsed_input, the_newname, prinf = True, merging = 'entries'):
        # make new entry with sum of parsed input
        if prinf:
            print 'Merging %d %s as "%s":\n    %s' % (len(parsed_input), merging, the_newname, '\n    '.join(parsed_input[:10]))
            if len(parsed_input) > 10:
                print '... and %d more ... \n' % (len(parsed_input) - 10)
            else:
                print ''
        # remove old entries
        temp = sum([df[i] for i in parsed_input])
        df = df.drop(parsed_input, axis = 1)
        df[the_newname] = temp
        return df

    def just_these_subcorpora(df, lst_of_subcorpora, prinf = True):        
        if type(lst_of_subcorpora[0]) == int:
            lst_of_subcorpora = [str(l) for l in lst_of_subcorpora]
        good_years = [subcorpus for subcorpus in list(df.index) if subcorpus in lst_of_subcorpora]
        if prinf:
            print 'Keeping %d subcorpora:\n    %s' % (len(good_years), '\n    '.join(good_years[:10]))
            if len(good_years) > 10:
                print '... and %d more ... \n' % (len(good_years) - 10)
            else:
                print ''
        df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus not in good_years], axis = 0)
        return df

    def skip_these_subcorpora(df, lst_of_subcorpora, prinf = True):
        if type(lst_of_subcorpora) == int:
            lst_of_subcorpora = [lst_of_subcorpora]
        if type(lst_of_subcorpora[0]) == int:
            lst_of_subcorpora = [str(l) for l in lst_of_subcorpora]
        bad_years = [subcorpus for subcorpus in list(df.index) if subcorpus in lst_of_subcorpora]
        if prinf:       
            print 'Skipping %d subcorpora:\n    %s' % (len(bad_years), '\n    '.join([str(i) for i in bad_years[:10]]))
            if len(bad_years) > 10:
                print '... and %d more ... \n' % (len(bad_years) - 10)
            else:
                print ''
        df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus in bad_years], axis = 0)
        return df

    def span_these_subcorpora(df, lst_of_subcorpora, prinf = True):
        non_totals = [subcorpus for subcorpus in list(df.index)]
        good_years = [subcorpus for subcorpus in non_totals if int(subcorpus) >= int(lst_of_subcorpora[0]) and int(subcorpus) <= int(lst_of_subcorpora[-1])]
        if prinf:        
            print 'Keeping subcorpora:\n    %d--%d\n' % (int(lst_of_subcorpora[0]), int(lst_of_subcorpora[-1]))
        df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus not in good_years], axis = 0)
        # retotal needed here
        return df

    def projector(df, list_of_tuples, prinf = True):
        for subcorpus, projection_value in list_of_tuples:
            if type(subcorpus) == int:
                subcorpus = str(subcorpus)
            df.ix[subcorpus] = df.ix[subcorpus] * projection_value
            if prinf:
                if type(projection_value) == float:
                    print 'Projection: %s * %s' % (subcorpus, projection_value)
                if type(projection_value) == int:
                    print 'Projection: %s * %d' % (subcorpus, projection_value)
        if prinf:
            print ''
        return df

    def do_stats(df):
        """do linregress and add to df"""
        from scipy.stats import linregress
        entries = []
        slopes = []
        intercepts = []
        rs = []
        ps = []
        stderrs = []
        indices = list(df.index)
        first_year = list(df.index)[0]
        try:
            x = [int(y) - int(first_year) for y in indices]
        except ValueError:
            x = range(len(indices))
        statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
        for entry in list(df.columns):
            entries.append(entry)
            y = list(df[entry])
            slope, intercept, r, p, stderr = linregress(x, y)
            slopes.append(slope)
            intercepts.append(intercept)
            rs.append(r)
            ps.append(p)
            stderrs.append(stderr)
        sl = pd.DataFrame([slopes, intercepts, rs, ps, stderrs], 
                           index = statfields, 
                           columns = list(df.columns))
        df = df.append(sl)
        # drop infinites and nans
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.fillna(0.0)
        return df

    def recalc(df, operation = '%'):
        statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
        """Add totals to the dataframe1"""

        #df.drop('Total', axis = 0, inplace = True)
        #df.drop('Total', axis = 1, inplace = True)
        try:
            df['temp-Total'] = df.drop(statfields).sum(axis = 1)
        except:
            df['temp-Total'] = df.sum(axis = 1)
        df = df.T
        try:
            df['temp-Total'] = df.drop(statfields).sum(axis = 1)
        except:
            df['temp-Total'] = df.sum(axis = 1)
        df = df.T

        # make totals percentages if need be.
        #if operation == '%':
        #    df['Total'] = df['Total'] * 100.0
        #    df['Total'] = df['Total'].div(list(df2), axis = 0)
        #elif operation == '+':
        #    df['Total'] = df['Total'].add(list(df2), axis = 0)
        #elif operation == '-':
        #    df['Total'] = df['Total'].sub(list(df2), axis = 0)
        #elif operation == '*':
        #    df['Total'] = df['Total'].mul(list(df2), axis = 0)
        #elif operation == '/':
        #    df['Total'] = df['Total'].div(list(df2), axis = 0)
        return df

    def resort(df, sort_by = False, keep_stats = False):
        """sort results"""
        # translate options and make sure they are parseable
        

        options = ['total', 'name', 'infreq', 'increase', 'turbulent',
                   'decrease', 'static', 'most', 'least', 'none']

        if sort_by is True:
            sort_by = 'total'
        if sort_by == 'most':
            sort_by = 'total'
        if sort_by == 'least':
            sort_by = 'infreq'
        if sort_by not in options:
            raise ValueError("sort_by parameter error: '%s' not recognised. Must be True, False, %s" % (sort_by, ', '.join(options)))

        if operation.startswith('k'):
            if type(df) == pandas.core.series.Series:
                if sort_by == 'total':
                    df = df.order(ascending = False)

                elif sort_by == 'infreq':
                    df = df.order(ascending = True)

                elif sort_by == 'name':
                    df = df.sort_index()
                return df

        if just_totals:
            if sort_by == 'infreq':
                df = df.sort(columns = 'Combined total', ascending = True)
            elif sort_by == 'total':
                df = df.sort(columns = 'Combined total', ascending = False)
            elif sort_by == 'name':
                df = df.sort_index()
            return df

        # this is really shitty now that i know how to sort, like in the above
        if keep_stats:
            df = do_stats(df)
        if sort_by == 'total':
            if df1_istotals:
                df = df.T
            df = recalc(df, operation = operation)
            tot = df.ix['temp-Total']
            df = df[tot.argsort()[::-1]]
            df = df.drop('temp-Total', axis = 0)
            df = df.drop('temp-Total', axis = 1)
            if df1_istotals:
                df = df.T
        elif sort_by == 'infreq':
            if df1_istotals:
                df = df.T
            df = recalc(df, operation = operation)
            tot = df.ix['temp-Total']
            df = df[tot.argsort()]
            df = df.drop('temp-Total', axis = 0)
            df = df.drop('temp-Total', axis = 1)
            if df1_istotals:
                df = df.T
        elif sort_by == 'name':
            # currently case sensitive...
            df = df.reindex_axis(sorted(df.columns), axis=1)
        else:
            statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
            
            if not keep_stats:
                df = do_stats(df)

            slopes = df.ix['slope']
            if sort_by == 'increase':
                df = df[slopes.argsort()[::-1]]
            elif sort_by == 'decrease':
                df = df[slopes.argsort()]
            elif sort_by == 'static':
                df = df[slopes.abs().argsort()]
            elif sort_by == 'turbulent':
                df = df[slopes.abs().argsort()[::-1]]
            if remove_above_p:
                # the easy way to do it!
                df = df.T
                df = df[df['p'] <= p]
                df = df.T

            # remove stats field by default
            if not keep_stats:
                df = df.drop(statfields, axis = 0)

        return df

    def set_threshold(big_list, threshold, prinf = True, for_keywords = False):
        if type(threshold) == str:
            if threshold.startswith('l'):
                denominator = 10000
            if threshold.startswith('m'):
                denominator = 5000
            if threshold.startswith('h'):
                denominator = 2500

            if type(big_list) == pandas.core.frame.DataFrame:
                tot = big_list.sum().sum()

            if type(big_list) == pandas.core.series.Series:
                tot = big_list.sum()
            the_threshold = float(tot) / float(denominator)
            #if for_keywords:
                #the_threshold = the_threshold / 2
        else:
            the_threshold = threshold
        if prinf:
            print 'Threshold: %d\n' % the_threshold
        return the_threshold

#####################################################

    # check if we're in concordance mode
    try:
        if list(dataframe1.columns) == ['l', 'm', 'r']:
            conc_lines = True
        else:
            conc_lines = False
    except:
        conc_lines = False

    # copy dataframe to be very safe
    try:
        df = dataframe1.copy()
    except AttributeError:
        no_good_dataframe1 = True
        while no_good_dataframe1:
            if 'interrogation' in str(type(dataframe1)):
                sel = raw_input("\nIt looks like you're trying to edit an interrogation, " \
                                  "rather than an interrogation's .results or .totals branch. You can:\n\n    a) select .results branch\n    b) select .totals branch\n    c) exit\n\nYour choice: ")
                if sel.startswith('a'):
                    try:
                        dataframe1 = dataframe1.results
                        no_good_dataframe1 = False
                    except:
                        pass
                elif sel.startswith('b'):
                    try:
                        dataframe1 = dataframe1.totals
                        no_good_dataframe1 = False
                    except:
                        pass
                else:
                    return
            else:
                raise ValueError("Thing to be edited (dataframe1) needs to be a Pandas DataFrame or Series. " \
                                  "Right now, its type is: '%s'." % type(dataframe1).__name__)

        df = dataframe1.copy()
    
    # do concordance work
    if conc_lines:
        df = dataframe1.copy()

        if just_entries:
            if type(just_entries) == int:
                just_entries = [just_entries]
            if type(just_entries) == str:
                df = df[df['m'].str.contains(just_entries)]
            if type(just_entries) == list:
                if type(just_entries[0]) == str:
                    regex = re.compile(r'(?i)^(' + r'|'.join(just_entries) + r')$')
                    df = df[df['m'].str.contains(regex)]
                else:
                    df = df.ix[just_entries].reset_index(drop = True)

        if skip_entries:
            if type(skip_entries) == int:
                skip_entries = [skip_entries]
            if type(skip_entries) == str:
                df = df[~df['m'].str.contains(skip_entries)]
            if type(skip_entries) == list:
                if type(skip_entries[0]) == str:
                    regex = re.compile(r'(?i)^(' + r'|'.join(skip_entries) + r')$')
                    df = df[~df['m'].str.contains(regex)]
                else:
                    df = df.ix[[e for e in list(df.index) if e not in skip_entries]].reset_index(drop = True)

        return df

    if print_info:
        print '\n***Processing results***\n========================\n'

    df1_istotals = False
    if type(df) == pandas.core.series.Series:
        df1_istotals = True
        df = pandas.DataFrame(df)
        # if just a single result
    else:
        df = pandas.DataFrame(df)
    if operation.startswith('k'):
        if sort_by is False:
            if not df1_istotals:
                sort_by = 'turbulent'
        if df1_istotals:
            df = df.T
    
    # figure out if there's a second list
    # copy and remove totals if there is
    single_totals = True
    using_totals = False
    outputmode = False

    try:
        if dataframe2.empty is False:            
            df2 = dataframe2.copy()
            using_totals = True

            if type(df2) == pandas.core.frame.DataFrame:
                if len(df2.columns) > 1:
                    single_totals = False
                else:
                    df2 = pd.Series(df2)
            elif type(df2) == pandas.core.series.Series:
                single_totals = True
            else:
                raise ValueError('dataframe2 not recognised.')   
    except AttributeError:
        if dataframe2 == 'self':
            outputmode = True

    if projection:
        # projection shouldn't do anything when working with '%', remember.
        df = projector(df, projection)
        if using_totals:
            df2 = projector(df2, projection)

    if convert_spelling:
        df = convert_spell(df, convert_to = convert_spelling)
        df = merge_duplicates(df)

        if not single_totals:
            df2 = convert_spell(df2, convert_to = convert_spelling, print_info = False)
            df2 = merge_duplicates(df2, print_info = False)
        if not df1_istotals:
            sort_by = 'total'

    if replace_names:
        df = name_replacer(df, replace_names)
        df = merge_duplicates(df)
        if not single_totals:
            df2 = name_replacer(df2, print_info = False)
            df2 = merge_duplicates(df2, print_info = False)
        if not sort_by:
            sort_by = 'total'

    # remove old stats if they're there:
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        df = df.drop(statfields, axis = 0)
    except:
        pass
    if using_totals:
        try:
            df2 = df2.drop(statfields, axis = 0)
        except:
            pass

    # merging
    if merge_entries:
        the_newname = newname_getter(df, parse_input(df, merge_entries), newname = newname, prinf = print_info)
    
        df = merge_these_entries(df, parse_input(df, merge_entries), the_newname, prinf = print_info)
        if not single_totals:
            df2 = merge_these_entries(df2, parse_input(df2, merge_entries), the_newname, prinf = False)
    
    if merge_subcorpora:
        the_newname = newname_getter(df.T, parse_input(df.T, merge_subcorpora), 
                                     newname = new_subcorpus_name, 
                                     merging_subcorpora = True,
                                     prinf = print_info)
        df = merge_these_entries(df.T, parse_input(df.T, merge_subcorpora), the_newname, merging = 'subcorpora', prinf = print_info).T
        if using_totals:
            df2 = merge_these_entries(df2.T, parse_input(df2.T, merge_subcorpora), the_newname, merging = 'subcorpora', prinf = print_info).T
    
    if just_subcorpora:
        df = just_these_subcorpora(df, just_subcorpora, prinf = print_info)
        if using_totals:
            df2 = just_these_subcorpora(df2, just_subcorpora, prinf = False)
    
    if skip_subcorpora:
        df = skip_these_subcorpora(df, skip_subcorpora, prinf = print_info)
        if using_totals:
            df2 = skip_these_subcorpora(df2, skip_subcorpora, prinf = False)
    
    if span_subcorpora:
        df = span_these_subcorpora(df, span_subcorpora, prinf = print_info)
        if using_totals:
            df2 = span_these_subcorpora(df2, span_subcorpora, prinf = False)

    if just_entries:
        df = just_these_entries(df, parse_input(df, just_entries), prinf = print_info)
        if not single_totals:
            df2 = just_these_entries(df2, parse_input(df2, just_entries), prinf = False)
    if skip_entries:
        df = skip_these_entries(df, parse_input(df, skip_entries), prinf = print_info)
        if not single_totals:
            df2 = skip_these_entries(df2, parse_input(df2, skip_entries), prinf = False)

    # drop infinites and nans
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0.0)

    # make just_totals as dataframe
    just_one_total_number = False
    if just_totals:
        df = pd.DataFrame(df.sum(), columns = ['Combined total'])
        if using_totals:
            if not single_totals:
                df2 = pd.DataFrame(df2.sum(), columns = ['Combined total'])
            else:
                just_one_total_number = True
                df2 = df2.sum()

    # generate '%' totals here ...
    if using_totals:
        if operation == '%':
            tot1 = df.T.sum()
            if single_totals:
                tot2 = df2
            else:
                tot2 = df2.T.sum()
            total = tot1 * 100.0 / tot2

    # combine lists
    if using_totals or outputmode:
        if not operation.startswith('k'):
            the_threshold = 0
            # set a threshold if just_totals
            if outputmode is True:
                df2 = df.T.sum()
                if not just_totals:
                    df2.name = 'Total'
                else:
                    df2.name = 'Combined total'
                using_totals = True
                single_totals = True
            if just_totals:
                if not single_totals:
                    the_threshold = set_threshold(df2, threshold, prinf = print_info)
            df = combiney(df, df2, operation = operation, threshold = the_threshold, prinf = print_info)
    
    # if doing keywording...
    if operation.startswith('k'):
        from corpkit.keys import keywords

        # allow saved dicts to be df2, etc
        try:
            if dataframe2 == 'self':
                df2 = df.copy()
        except TypeError:
            pass
        if type(dataframe2) == str:
            if dataframe2 != 'self':
                df2 = dataframe2
    
        else:
            the_threshold = False

        df = keywords(df, df2, 
                      selfdrop = selfdrop, 
                      threshold = threshold, 
                      printstatus = print_info,
                      editing = True,
                      calc_all = calc_all,
                      **kwargs)

        # eh?
        df = df.T
    
    # drop infinites and nans
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0.0)

    # resort data
    if sort_by:
        df = resort(df, keep_stats = keep_stats, sort_by = sort_by)

    if keep_top:
        if not just_totals:
            df = df[list(df.columns)[:keep_top]]
        else:
            df = df.head(keep_top)

    if just_totals:
        # turn just_totals into series:
        df = pd.Series(df['Combined total'], name = 'Combined total')

    if df1_istotals:
        if operation.startswith('k'):
            try:
                df = pd.Series(df.ix[dataframe1.name])
                df.name = '%s: keyness' % df.name
            except:
                df = df.iloc[0,:]
                df.name = 'keyness' % df.name

    # generate totals branch if not percentage results:
    if df1_istotals or operation.startswith('k'):
        try:
            total = pd.Series(df['Total'], name = 'Total')
        except:
            total = 'none'
        #total = df.copy()
    else:
        # might be wrong if using division or something...
        total = df.T.sum()

    if type(df) == pandas.core.frame.DataFrame:
        datatype = df.ix[0].dtype
    else:
        datatype = df.dtype
    #make named_tuple
    the_operation = 'none'
    if using_totals:
        the_operation = operation

    the_options = {}
    the_options['time_started'] = the_time_started
    the_options['function'] = 'editor'
    the_options['dataframe1'] = dataframe1
    the_options['operation'] = the_operation
    the_options['dataframe2'] = dataframe2
    the_options['datatype'] = datatype
    the_options['sort_by'] = sort_by
    the_options['keep_stats'] = keep_stats
    the_options['just_totals'] = just_totals
    the_options['threshold'] = threshold # can be wrong!
    the_options['just_entries'] = just_entries
    the_options['just_entries'] = just_entries
    the_options['skip_entries'] = skip_entries
    the_options['merge_entries'] = merge_entries
    the_options['newname'] = newname
    the_options['just_subcorpora'] = just_subcorpora
    the_options['skip_subcorpora'] = skip_subcorpora
    the_options['span_subcorpora'] = span_subcorpora
    the_options['merge_subcorpora'] = merge_subcorpora
    the_options['new_subcorpus_name'] = new_subcorpus_name
    the_options['projection'] = projection
    the_options['remove_above_p'] = remove_above_p
    the_options['p'] = p
    the_options['revert_year'] = revert_year
    the_options['print_info'] = print_info

    outputnames = collections.namedtuple('edited_interrogation', ['query', 'results', 'totals'])
    output = outputnames(the_options, df, total)

    #print '\nResult (sample)\n'
    if print_info:
        #if merge_entries or merge_subcorpora or span_subcorpora or just_subcorpora or \
           #just_entries or skip_entries or skip_subcorpora or printed_th or projection:
        print '***Done!***\n========================\n'
    #print df.head().T
    #print ''
    if operation.startswith('k') or just_totals or df1_istotals:
        pd.set_option('display.max_rows', 30)
    else:
        pd.set_option('display.max_rows', 15)
    pd.set_option('display.max_columns', 8)
    pd.set_option('max_colwidth',70)
    pd.set_option('display.width', 800)
    pd.set_option('expand_frame_repr', False)
    pd.set_option('display.float_format', lambda x: '%.2f' % x)

    return output
예제 #2
0
def editor(interrogation,
           operation=None,
           denominator=False,
           sort_by=False,
           keep_stats=False,
           keep_top=False,
           just_totals=False,
           threshold='medium',
           just_entries=False,
           skip_entries=False,
           span_entries=False,
           merge_entries=False,
           just_subcorpora=False,
           skip_subcorpora=False,
           span_subcorpora=False,
           merge_subcorpora=False,
           replace_names=False,
           replace_subcorpus_names=False,
           projection=False,
           remove_above_p=False,
           p=0.05,
           print_info=False,
           spelling=False,
           selfdrop=True,
           calc_all=True,
           keyword_measure='ll',
           **kwargs):
    """
    See corpkit.interrogation.Interrogation.edit() for docstring
    """

    # grab arguments, in case we get dict input and have to iterate
    locs = locals()

    import corpkit

    import re
    import collections
    import pandas as pd
    import numpy as np

    from pandas import DataFrame, Series
    from time import localtime, strftime

    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        have_ipython = False
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass
    # new ipython error
    except AttributeError:
        have_ipython = False
        pass

    # to use if we also need to worry about concordance lines
    return_conc = False

    from corpkit.interrogation import Interrodict, Interrogation, Concordance
    if interrogation.__class__ == Interrodict:
        locs.pop('interrogation', None)
        from collections import OrderedDict
        outdict = OrderedDict()
        for i, (k, v) in enumerate(interrogation.items()):
            # only print the first time around
            if i != 0:
                locs['print_info'] = False

            if isinstance(denominator,
                          STRINGTYPE) and denominator.lower() == 'self':
                denominator = interrogation

            # if df2 is also a dict, get the relevant entry

            if isinstance(denominator, (dict, Interrodict)):
                #if sorted(set([i.lower() for i in list(dataframe1.keys())])) == \
                #   sorted(set([i.lower() for i in list(denominator.keys())])):
                #   locs['denominator'] = denominator[k]

                # fix: this repeats itself for every key, when it doesn't need to
                # denominator_sum:
                if kwargs.get('denominator_sum'):
                    locs['denominator'] = denominator.collapse(axis='key')

                if kwargs.get('denominator_totals'):
                    locs['denominator'] = denominator[k].totals
                else:
                    locs['denominator'] = denominator[k].results

            outdict[k] = v.results.edit(**locs)
        if print_info:

            thetime = strftime("%H:%M:%S", localtime())
            print(
                "\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n"
                % (thetime, "'\n         '".join(sorted(outdict.keys()))))
        return Interrodict(outdict)

    elif isinstance(interrogation, (DataFrame, Series)):
        dataframe1 = interrogation
    elif isinstance(interrogation, Interrogation):
        #if interrogation.__dict__.get('concordance', None) is not None:
        #    concordances = interrogation.concordance
        branch = kwargs.pop('branch', 'results')
        if branch.lower().startswith('r'):
            dataframe1 = interrogation.results
        elif branch.lower().startswith('t'):
            dataframe1 = interrogation.totals
        elif branch.lower().startswith('c'):
            dataframe1 = interrogation.concordance
            return_conc = True
        else:
            dataframe1 = interrogation.results

    elif isinstance(interrogation, Concordance) or \
                        all(x in list(dataframe1.columns) for x in [ 'l', 'm', 'r']):
        return_conc = True
        print('heree')
        dataframe1 = interrogation
    # hope for the best
    else:
        dataframe1 = interrogation

    the_time_started = strftime("%Y-%m-%d %H:%M:%S")

    pd.options.mode.chained_assignment = None

    try:
        from process import checkstack
    except ImportError:
        from corpkit.process import checkstack

    if checkstack('pythontex'):
        print_info = False

    def combiney(df, df2, operation='%', threshold='medium', prinf=True):
        """
        Mash df and df2 together in appropriate way
        """
        totals = False
        # delete under threshold
        if just_totals:
            if using_totals:
                if not single_totals:
                    to_drop = list(
                        df2[df2['Combined total'] < threshold].index)
                    df = df.drop([e for e in to_drop if e in list(df.index)])
                    if prinf:
                        to_show = []
                        [to_show.append(w) for w in to_drop[:5]]
                        if len(to_drop) > 10:
                            to_show.append('...')
                            [to_show.append(w) for w in to_drop[-5:]]
                        if len(to_drop) > 0:
                            print(
                                'Removing %d entries below threshold:\n    %s'
                                % (len(to_drop), '\n    '.join(to_show)))
                        if len(to_drop) > 10:
                            print('... and %d more ... \n' %
                                  (len(to_drop) - len(to_show) + 1))
                        else:
                            print('')
                else:
                    denom = df2
        else:
            denom = list(df2)
        if single_totals:
            if operation == '%':
                totals = df.sum() * 100.0 / float(df.sum().sum())
                df = df * 100.0
                try:
                    df = df.div(denom, axis=0)
                except ValueError:

                    thetime = strftime("%H:%M:%S", localtime())
                    print(
                        '%s: cannot combine DataFrame 1 and 2: different shapes'
                        % thetime)
            elif operation == '+':
                try:
                    df = df.add(denom, axis=0)
                except ValueError:

                    thetime = strftime("%H:%M:%S", localtime())
                    print(
                        '%s: cannot combine DataFrame 1 and 2: different shapes'
                        % thetime)
            elif operation == '-':
                try:
                    df = df.sub(denom, axis=0)
                except ValueError:

                    thetime = strftime("%H:%M:%S", localtime())
                    print(
                        '%s: cannot combine DataFrame 1 and 2: different shapes'
                        % thetime)
            elif operation == '*':
                totals = df.sum() * float(df.sum().sum())
                try:
                    df = df.mul(denom, axis=0)
                except ValueError:

                    thetime = strftime("%H:%M:%S", localtime())
                    print(
                        '%s: cannot combine DataFrame 1 and 2: different shapes'
                        % thetime)
            elif operation == '/':
                try:
                    totals = df.sum() / float(df.sum().sum())
                    df = df.div(denom, axis=0)
                except ValueError:

                    thetime = strftime("%H:%M:%S", localtime())
                    print(
                        '%s: cannot combine DataFrame 1 and 2: different shapes'
                        % thetime)

            elif operation == 'a':
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis=1) / df2

            elif operation.startswith('c'):
                import warnings
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    df = pandas.concat([df, df2], axis=1)
            return df, totals

        elif not single_totals:
            if not operation.startswith('a'):
                # generate totals
                if operation == '%':
                    totals = df.sum() * 100.0 / float(df2.sum().sum())
                if operation == '*':
                    totals = df.sum() * float(df2.sum().sum())
                if operation == '/':
                    totals = df.sum() / float(df2.sum().sum())
                if operation.startswith('c'):
                    # add here the info that merging will not work
                    # with identical colnames
                    import warnings
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        d = pd.concat([df.T, df2.T])
                        # make index nums
                        d = d.reset_index()
                        # sum and remove duplicates
                        d = d.groupby('index').sum()
                        dx = d.reset_index('index')
                        dx.index = list(dx['index'])
                        df = dx.drop('index', axis=1).T

                def editf(datum):
                    meth = {
                        '%': datum.div,
                        '*': datum.mul,
                        '/': datum.div,
                        '+': datum.add,
                        '-': datum.sub
                    }

                    if datum.name in list(df2.columns):

                        method = meth[operation]
                        mathed = method(df2[datum.name], fill_value=0.0)
                        if operation == '%':
                            return mathed * 100.0
                        else:
                            return mathed
                    else:
                        return datum * 0.0

                df = df.apply(editf)

            else:
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis=1) / df2.T.sum()

        return df, totals

    def skip_keep_merge_span(df):
        """
        Do all skipping, keeping, merging and spanning
        """
        from corpkit.dictionaries.process_types import Wordlist
        if skip_entries:
            if isinstance(skip_entries, (list, Wordlist)):
                df = df.drop(list(skip_entries), axis=1, errors='ignore')
            else:
                df = df.loc[:, ~df.columns.str.contains(skip_entries)]
        if just_entries:
            if isinstance(just_entries, (list, Wordlist)):
                je = [i for i in list(just_entries) if i in list(df.columns)]
                df = df[je]
            else:
                df = df.loc[:, df.columns.str.contains(just_entries)]
        if merge_entries:
            for newname, crit in merge_entries.items():
                if isinstance(crit, (list, Wordlist)):
                    crit = [i for i in list(crit) if i in list(df.columns)]
                    cr = [i for i in list(crit) if i in list(df.columns)]
                    summed = df[cr].sum(axis=1)
                    df = df.drop(list(cr), axis=1, errors='ignore')
                else:
                    summed = df.loc[:,
                                    df.columns.str.contains(crit)].sum(axis=1)
                    df = df.loc[:, ~df.columns.str.contains(crit)]
                df.insert(0, newname, summed, allow_duplicates=True)
        if span_entries:
            df = df.iloc[:, span_entries[0]:span_entries[1]]
        if skip_subcorpora:
            if isinstance(skip_subcorpora, (list, Wordlist)):
                df = df.drop(list(skip_subcorpora), axis=0, errors='ignore')
            else:
                df = df[~df.index.str.contains(skip_subcorpora)]
        if just_subcorpora:
            if isinstance(just_subcorpora, (list, Wordlist)):
                js = [i for i in list(just_subcorpora) if i in list(df.index)]
                df = df.loc[js]
            else:
                df = df[df.index.str.contains(just_subcorpora)]
        if merge_subcorpora:
            df = df.T
            for newname, crit in merge_subcorpora.items():
                if isinstance(crit, (list, Wordlist)):
                    crit = [i for i in list(crit) if i in list(df.columns)]
                    summed = df[list(crit)].sum(axis=1)
                    df = df.drop(list(crit), axis=1, errors='ignore')
                else:
                    summed = df.loc[:,
                                    df.columns.str.contains(crit)].sum(axis=1)
                    df = df.loc[:, ~df.columns.str.contains(crit)]
                df.insert(0, newname, summed, allow_duplicates=True)
            df = df.T
        if span_subcorpora:
            df = df.iloc[span_subcorpora[0]:span_subcorpora[1], :]
        return df

    def parse_input(df, the_input):
        """turn whatever has been passed in into list of words that can 
           be used as pandas indices---maybe a bad way to go about it"""
        parsed_input = False
        import re
        if the_input == 'all':
            the_input = r'.*'
        if isinstance(the_input, int):
            try:
                the_input = str(the_input)
            except:
                pass
            the_input = [the_input]
        elif isinstance(the_input, STRINGTYPE):
            regex = re.compile(the_input)
            parsed_input = [w for w in list(df) if re.search(regex, w)]
            return parsed_input
        from corpkit.dictionaries.process_types import Wordlist
        if isinstance(the_input, Wordlist) or the_input.__class__ == Wordlist:
            the_input = list(the_input)
        if isinstance(the_input, list):
            if isinstance(the_input[0], int):
                parsed_input = [
                    word for index, word in enumerate(list(df))
                    if index in the_input
                ]
            elif isinstance(the_input[0], STRINGTYPE):
                try:
                    parsed_input = [
                        word for word in the_input if word in df.columns
                    ]
                except AttributeError:  # if series
                    parsed_input = [
                        word for word in the_input if word in df.index
                    ]
        return parsed_input

    def synonymise(df, pos='n'):
        """pass a df and a pos and convert df columns to most common synonyms"""
        from nltk.corpus import wordnet as wn
        #from dictionaries.taxonomies import taxonomies
        from collections import Counter
        fixed = []
        for w in list(df.columns):
            try:
                syns = []
                for syns in wn.synsets(w, pos=pos):
                    for w in syns:
                        synonyms.append(w)
                top_syn = Counter(syns).most_common(1)[0][0]
                fixed.append(top_syn)
            except:
                fixed.append(w)
        df.columns = fixed
        return df

    def convert_spell(df, convert_to='US', print_info=print_info):
        """turn dataframes into us/uk spelling"""
        from dictionaries.word_transforms import usa_convert
        if print_info:
            print('Converting spelling ... \n')
        if convert_to == 'UK':
            usa_convert = {v: k for k, v in list(usa_convert.items())}
        fixed = []
        for val in list(df.columns):
            try:
                fixed.append(usa_convert[val])
            except:
                fixed.append(val)
        df.columns = fixed
        return df

    def merge_duplicates(df, print_info=print_info):
        if print_info:
            print('Merging duplicate entries ... \n')
        # now we have to merge all duplicates
        for dup in df.columns.get_duplicates():
            #num_dupes = len(list(df[dup].columns))
            temp = df[dup].sum(axis=1)
            #df = df.drop([dup for d in range(num_dupes)], axis=1)
            df = df.drop(dup, axis=1)
            df[dup] = temp
        return df

    def name_replacer(df, replace_names, print_info=print_info):
        """replace entry names and merge"""
        import re
        # get input into list of tuples
        # if it's a string, we want to delete it
        if isinstance(replace_names, STRINGTYPE):
            replace_names = [(replace_names, '')]
        # this is for some malformed list
        if not isinstance(replace_names, dict):
            if isinstance(replace_names[0], STRINGTYPE):
                replace_names = [replace_names]
        # if dict, make into list of tupes
        if isinstance(replace_names, dict):
            replace_names = [(v, k) for k, v in replace_names.items()]
        for to_find, replacement in replace_names:
            if print_info:
                if replacement:
                    print('Replacing "%s" with "%s" ...\n' %
                          (to_find, replacement))
                else:
                    print('Deleting "%s" from entry names ...\n' % to_find)
            to_find = re.compile(to_find)
            if not replacement:
                replacement = ''
            df.columns = [
                re.sub(to_find, replacement, l) for l in list(df.columns)
            ]
        df = merge_duplicates(df, print_info=False)
        return df

    def newname_getter(df,
                       parsed_input,
                       newname='combine',
                       prinf=True,
                       merging_subcorpora=False):
        """makes appropriate name for merged entries"""
        if merging_subcorpora:
            if newname is False:
                newname = 'combine'
        if isinstance(newname, int):
            the_newname = list(df.columns)[newname]
        elif isinstance(newname, STRINGTYPE):
            if newname == 'combine':
                if len(parsed_input) <= 3:
                    the_newname = '/'.join(parsed_input)
                elif len(parsed_input) > 3:
                    the_newname = '/'.join(parsed_input[:3]) + '...'
            else:
                the_newname = newname
        if not newname:
            # revise this code
            import operator
            sumdict = {}
            for item in parsed_input:
                summed = sum(list(df[item]))
                sumdict[item] = summed
            the_newname = max(iter(sumdict.items()),
                              key=operator.itemgetter(1))[0]
        if not isinstance(the_newname, STRINGTYPE):
            the_newname = str(the_newname, errors='ignore')
        return the_newname

    def projector(df, list_of_tuples, prinf=True):
        """project abs values"""
        if isinstance(list_of_tuples, list):
            tdict = {}
            for a, b in list_of_tuples:
                tdict[a] = b
            list_of_tuples = tdict
        for subcorpus, projection_value in list(list_of_tuples.items()):
            if isinstance(subcorpus, int):
                subcorpus = str(subcorpus)
            df.ix[subcorpus] = df.ix[subcorpus] * projection_value
            if prinf:
                if isinstance(projection_value, float):
                    print('Projection: %s * %s' %
                          (subcorpus, projection_value))
                if isinstance(projection_value, int):
                    print('Projection: %s * %d' %
                          (subcorpus, projection_value))
        if prinf:
            print('')
        return df

    def lingres(ser, index):
        from scipy.stats import linregress
        from pandas import Series
        ix = ['slope', 'intercept', 'r', 'p', 'stderr']
        return Series(linregress(index, ser.values), index=ix)

    def do_stats(df):
        """do linregress and add to df"""

        try:
            from scipy.stats import linregress
        except ImportError:
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: sort type not available in this version of corpkit.' %
                  thetime)
            return False

        indices = list(df.index)
        first_year = list(df.index)[0]
        try:
            x = [int(y) - int(first_year) for y in indices]
        except ValueError:
            x = list(range(len(indices)))

        stats = df.apply(lingres, axis=0, index=x)
        df = df.append(stats)
        df = df.replace([np.inf, -np.inf], 0.0)

        return df

    def resort(df, sort_by=False, keep_stats=False):
        """
        Sort results, potentially using scipy's linregress
        """

        # translate options and make sure they are parseable
        stat_field = ['slope', 'intercept', 'r', 'p', 'stderr']
        easy_sorts = ['total', 'infreq', 'name', 'most', 'least', 'reverse']
        stat_sorts = ['increase', 'decrease', 'static', 'turbulent']
        options = stat_field + easy_sorts + stat_sorts
        sort_by_convert = {'most': 'total', True: 'total', 'least': 'infreq'}
        sort_by = sort_by_convert.get(sort_by, sort_by)

        # probably broken :(
        if just_totals:
            if sort_by == 'name':
                return df.sort_index()
            else:
                return df.sort_values(by='Combined total',
                                      ascending=sort_by != 'total',
                                      axis=1)

        stats_done = False
        if keep_stats or sort_by in stat_field + stat_sorts:
            df = do_stats(df)
            stats_done = True
            if isinstance(df, bool):
                if df is False:
                    return False

        if isinstance(df, Series):
            if stats_done:
                stats = df.ix[range(-5, 0)]
                df = df.drop(list(stats.index))
            if sort_by == 'name':
                df = df.sort_index()
            elif sort_by == 'reverse':
                df = df[::-1]
            else:
                df = df.sort_values(ascending=sort_by != 'total')
            if stats_done:
                df = df.append(stats)
            return df

        if sort_by == 'name':
            # currently case sensitive
            df = df.reindex_axis(sorted(df.columns), axis=1)
        elif sort_by in ['total', 'infreq']:
            if df1_istotals:
                df = df.T
            df = df[list(
                df.sum().sort_values(ascending=sort_by != 'total').index)]

        elif sort_by == 'reverse':
            df = df.T[::-1].T
        # sort by slope etc., or search by subcorpus name
        if sort_by in stat_field or sort_by not in options:
            asc = kwargs.get('reverse', False)
            df = df.T.sort_values(by=sort_by, ascending=asc).T

        if sort_by in ['increase', 'decrease', 'static', 'turbulent']:
            slopes = df.ix['slope']
            if sort_by == 'increase':
                df = df[slopes.argsort()[::-1]]
            elif sort_by == 'decrease':
                df = df[slopes.argsort()]
            elif sort_by == 'static':
                df = df[slopes.abs().argsort()]
            elif sort_by == 'turbulent':
                df = df[slopes.abs().argsort()[::-1]]
            if remove_above_p:
                df = df.T
                df = df[df['p'] <= p]
                df = df.T

        # remove stats field by default
        if not keep_stats:
            df = df.drop(stat_field, axis=0, errors='ignore')
        return df

    def set_threshold(big_list, threshold, prinf=True):
        if isinstance(threshold, STRINGTYPE):
            if threshold.startswith('l'):
                denominator = 10000
            if threshold.startswith('m'):
                denominator = 5000
            if threshold.startswith('h'):
                denominator = 2500
            if isinstance(big_list, DataFrame):
                tot = big_list.sum().sum()

            if isinstance(big_list, Series):
                tot = big_list.sum()
            tshld = float(tot) / float(denominator)
        else:
            tshld = threshold
        if prinf:
            print('Threshold: %d\n' % tshld)
        return tshld

    # copy dataframe to be very safe
    df = dataframe1.copy()
    # make cols into strings
    try:
        df.columns = [str(c) for c in list(df.columns)]
    except:
        pass

    if operation is None:
        operation = 'None'

    if isinstance(interrogation, Concordance):
        return_conc = True
    # do concordance work
    if return_conc:
        if just_entries:
            if isinstance(just_entries, int):
                just_entries = [just_entries]
            if isinstance(just_entries, STRINGTYPE):
                df = df[df['m'].str.contains(just_entries)]
            if isinstance(just_entries, list):
                if all(isinstance(e, STRINGTYPE) for e in just_entries):
                    mp = df['m'].map(lambda x: x in just_entries)
                    df = df[mp]
                else:
                    df = df.ix[just_entries]

        if skip_entries:
            if isinstance(skip_entries, int):
                skip_entries = [skip_entries]
            if isinstance(skip_entries, STRINGTYPE):
                df = df[~df['m'].str.contains(skip_entries)]
            if isinstance(skip_entries, list):
                if all(isinstance(e, STRINGTYPE) for e in skip_entries):
                    mp = df['m'].map(lambda x: x not in skip_entries)
                    df = df[mp]
                else:
                    df = df.drop(skip_entries, axis=0)

        if just_subcorpora:
            if isinstance(just_subcorpora, int):
                just_subcorpora = [just_subcorpora]
            if isinstance(just_subcorpora, STRINGTYPE):
                df = df[df['c'].str.contains(just_subcorpora)]
            if isinstance(just_subcorpora, list):
                if all(isinstance(e, STRINGTYPE) for e in just_subcorpora):
                    mp = df['c'].map(lambda x: x in just_subcorpora)
                    df = df[mp]
                else:
                    df = df.ix[just_subcorpora]

        if skip_subcorpora:
            if isinstance(skip_subcorpora, int):
                skip_subcorpora = [skip_subcorpora]
            if isinstance(skip_subcorpora, STRINGTYPE):
                df = df[~df['c'].str.contains(skip_subcorpora)]
            if isinstance(skip_subcorpora, list):
                if all(isinstance(e, STRINGTYPE) for e in skip_subcorpora):
                    mp = df['c'].map(lambda x: x not in skip_subcorpora)
                    df = df[mp]
                else:
                    df = df.drop(skip_subcorpora, axis=0)

        return Concordance(df)

    if print_info:
        print('\n***Processing results***\n========================\n')

    df1_istotals = False
    if isinstance(df, Series):
        df1_istotals = True
        df = DataFrame(df)
        # if just a single result
    else:
        df = DataFrame(df)
    if operation.startswith('k'):
        if sort_by is False:
            if not df1_istotals:
                sort_by = 'turbulent'
        if df1_istotals:
            df = df.T

    # figure out if there's a second list
    # copy and remove totals if there is
    single_totals = True
    using_totals = False
    outputmode = False

    if denominator.__class__ == Interrogation:
        try:
            denominator = denominator.results
        except AttributeError:
            denominator = denominator.totals

    if denominator is not False and not isinstance(denominator, STRINGTYPE):
        df2 = denominator.copy()
        using_totals = True
        if isinstance(df2, DataFrame):
            if len(df2.columns) > 1:
                single_totals = False
            else:
                df2 = Series(df2.iloc[:, 0])
        elif isinstance(df2, Series):
            single_totals = True
            #if operation == 'k':
            #raise ValueError('Keywording requires a DataFrame for denominator. Use "self"?')
    else:
        if operation in ['k', 'a', '%', '/', '*', '-', '+']:
            denominator = 'self'
        if denominator == 'self':
            outputmode = True

    if operation.startswith('a') or operation.startswith('A'):
        if list(df.columns)[0] != '0' and list(df.columns)[0] != 0:
            df = df.T
        if using_totals:
            if not single_totals:
                df2 = df2.T

    if projection:
        # projection shouldn't do anything when working with '%', remember.
        df = projector(df, projection)
        if using_totals:
            df2 = projector(df2, projection)

    if spelling:
        df = convert_spell(df, convert_to=spelling)
        df = merge_duplicates(df, print_info=False)

        if not single_totals:
            df2 = convert_spell(df2, convert_to=spelling, print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
        if not df1_istotals:
            sort_by = 'total'

    if replace_names:
        df = name_replacer(df, replace_names)
        df = merge_duplicates(df)
        if not single_totals:
            df2 = name_replacer(df2, replace_names, print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
        if not sort_by:
            sort_by = 'total'

    if replace_subcorpus_names:
        df = name_replacer(df.T, replace_subcorpus_names)
        df = merge_duplicates(df).T
        df = df.sort_index()
        if not single_totals:
            if isinstance(df2, DataFrame):
                df2 = df2.T
            df2 = name_replacer(df2, replace_subcorpus_names, print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
            if isinstance(df2, DataFrame):
                df2 = df2.T
            df2 = df2.sort_index()
        if not sort_by:
            sort_by = 'total'

    # remove old stats if they're there:
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        df = df.drop(statfields, axis=0)
    except:
        pass
    if using_totals:
        try:
            df2 = df2.drop(statfields, axis=0)
        except:
            pass

    # remove totals and tkinter order
    for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2,
                        [0, 1, 0, 1]):
        if name == 'Total' and df1_istotals:
            continue
        try:
            df = df.drop(name, axis=ax, errors='ignore')
        except:
            pass
    for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2,
                        [0, 1, 0, 1]):
        if name == 'Total' and single_totals:
            continue

        try:
            df2 = df2.drop(name, axis=ax, errors='ignore')
        except:
            pass

    df = skip_keep_merge_span(df)
    try:
        df2 = skip_keep_merge_span(df2)
    except:
        pass

    # drop infinites and nans
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0.0)

    if just_totals:
        df = DataFrame(df.sum(), columns=['Combined total'])
        if using_totals:
            if not single_totals:
                df2 = DataFrame(df2.sum(), columns=['Combined total'])
            else:
                df2 = df2.sum()

    tots = df.sum(axis=1)

    if using_totals or outputmode:
        if not operation.startswith('k'):
            tshld = 0
            # set a threshold if just_totals
            if outputmode is True:
                df2 = df.T.sum()
                if not just_totals:
                    df2.name = 'Total'
                else:
                    df2.name = 'Combined total'
                using_totals = True
                single_totals = True
            if just_totals:
                if not single_totals:
                    tshld = set_threshold(df2, threshold, prinf=print_info)
            df, tots = combiney(df,
                                df2,
                                operation=operation,
                                threshold=tshld,
                                prinf=print_info)

    # if doing keywording...
    if operation.startswith('k'):

        if isinstance(denominator, STRINGTYPE):
            if denominator == 'self':
                df2 = df.copy()
            else:
                df2 = denominator

        from corpkit.keys import keywords
        df = keywords(df,
                      df2,
                      selfdrop=selfdrop,
                      threshold=threshold,
                      print_info=print_info,
                      editing=True,
                      calc_all=calc_all,
                      sort_by=sort_by,
                      measure=keyword_measure,
                      **kwargs)

    # drop infinites and nans
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0.0)

    # resort data
    if sort_by or keep_stats:
        df = resort(df, keep_stats=keep_stats, sort_by=sort_by)
        if isinstance(df, bool):
            if df is False:
                return 'linregress'

    if keep_top:
        if not just_totals:
            df = df[list(df.columns)[:keep_top]]
        else:
            df = df.head(keep_top)

    if just_totals:
        # turn just_totals into series:
        df = Series(df['Combined total'], name='Combined total')

    if df1_istotals:
        if operation.startswith('k'):
            try:
                df = Series(df.ix[dataframe1.name])
                df.name = '%s: keyness' % df.name
            except:
                df = df.iloc[0, :]
                df.name = 'keyness' % df.name

    # generate totals branch if not percentage results:
    # fix me
    if df1_istotals or operation.startswith('k'):
        if not just_totals:
            try:
                total = Series(df['Total'], name='Total')
            except:
                total = 'none'
                pass

            #total = df.copy()
        else:
            total = 'none'
    else:
        # might be wrong if using division or something...
        try:
            total = df.T.sum(axis=1)
        except:
            total = 'none'

    if not isinstance(tots, DataFrame) and not isinstance(tots, Series):
        total = df.sum(axis=1)
    else:
        total = tots

    if isinstance(df, DataFrame):
        if df.empty:
            datatype = 'object'
        else:
            datatype = df.iloc[0].dtype
    else:
        datatype = df.dtype
    locs['datatype'] = datatype

    # TURN INT COL NAMES INTO STR
    try:
        df.results.columns = [str(d) for d in list(df.results.columns)]
    except:
        pass

    def add_tkt_index(df):
        """add an order for tkintertable if using gui"""
        if isinstance(df, Series):
            df = df.T
            df = df.drop('tkintertable-order', errors='ignore', axis=0)
            df = df.drop('tkintertable-order', errors='ignore', axis=1)
            dat = [i for i in range(len(df.index))]
            df['tkintertable-order'] = Series(dat, index=list(df.index))
            df = df.T
        return df

    # while tkintertable can't sort rows
    if checkstack('tkinter'):
        df = add_tkt_index(df)

    if kwargs.get('df1_always_df'):
        if isinstance(df, Series):
            df = DataFrame(df)

    # delete non-appearing conc lines
    lns = None
    if isinstance(getattr(interrogation, 'concordance', None), Concordance):
        try:
            col_crit = interrogation.concordance['m'].map(
                lambda x: x in list(df.columns))
            ind_crit = interrogation.concordance['c'].map(
                lambda x: x in list(df.index))
            lns = interrogation.concordance[col_crit]
            lns = lns.loc[ind_crit]
            lns = Concordance(lns)
        except ValueError:
            lns = None

    output = Interrogation(results=df,
                           totals=total,
                           query=locs,
                           concordance=lns)

    if print_info:
        print('***Done!***\n========================\n')

    return output
예제 #3
0
파일: editor.py 프로젝트: javelir/corpkit
def editor(interrogation, 
           operation=None,
           denominator=False,
           sort_by=False,
           keep_stats=False,
           keep_top=False,
           just_totals=False,
           threshold='medium',
           just_entries=False,
           skip_entries=False,
           merge_entries=False,
           just_subcorpora=False,
           skip_subcorpora=False,
           span_subcorpora=False,
           merge_subcorpora=False,
           replace_names=False,
           replace_subcorpus_names=False,
           projection=False,
           remove_above_p=False,
           p=0.05, 
           print_info=False,
           spelling=False,
           selfdrop=True,
           calc_all=True,
           keyword_measure='ll',
           **kwargs
          ):
    """
    See corpkit.interrogation.Interrogation.edit() for docstring
    """

    # grab arguments, in case we get dict input and have to iterate
    locs = locals()

    import corpkit

    import re
    import collections
    import pandas as pd
    import numpy as np

    from pandas import DataFrame, Series
    from time import localtime, strftime
    
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        have_ipython = False
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass

    # to use if we also need to worry about concordance lines
    return_conc = False

    from corpkit.interrogation import Interrodict, Interrogation, Concordance
    if interrogation.__class__ == Interrodict:
        locs.pop('interrogation', None)
        from collections import OrderedDict
        outdict = OrderedDict()
        for i, (k, v) in enumerate(interrogation.items()):
            # only print the first time around
            if i != 0:
                locs['print_info'] = False

            if isinstance(denominator, STRINGTYPE) and denominator.lower() == 'self':
                denominator = interrogation

            # if df2 is also a dict, get the relevant entry

            if isinstance(denominator, (dict, Interrodict)):
                #if sorted(set([i.lower() for i in list(dataframe1.keys())])) == \
                #   sorted(set([i.lower() for i in list(denominator.keys())])):
                #   locs['denominator'] = denominator[k]

                # fix: this repeats itself for every key, when it doesn't need to
                # denominator_sum: 
                if kwargs.get('denominator_sum'):
                    locs['denominator'] = denominator.collapse(axis='key')

                if kwargs.get('denominator_totals'):
                    locs['denominator'] = denominator[k].totals
                else:
                    locs['denominator'] = denominator[k].results


            outdict[k] = v.results.edit(**locs)
        if print_info:
            
            thetime = strftime("%H:%M:%S", localtime())
            print("\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n" % (thetime, "'\n         '".join(sorted(outdict.keys()))))
        return Interrodict(outdict)

    elif isinstance(interrogation, (DataFrame, Series)):
        dataframe1 = interrogation
    elif isinstance(interrogation, Interrogation):
        #if interrogation.__dict__.get('concordance', None) is not None:
        #    concordances = interrogation.concordance
        branch = kwargs.pop('branch', 'results')
        if branch.lower().startswith('r') :
            dataframe1 = interrogation.results
        elif branch.lower().startswith('t'):
            dataframe1 = interrogation.totals
        elif branch.lower().startswith('c'):
            dataframe1 = interrogation.concordance
            return_conc = True
        else:
            dataframe1 = interrogation.results
    
    elif isinstance(interrogation, Concordance) or \
                        all(x in list(dataframe1.columns) for x in [ 'l', 'm', 'r']):
        return_conc = True
        print('heree')
        dataframe1 = interrogation
    # hope for the best
    else:
        dataframe1 = interrogation

    the_time_started = strftime("%Y-%m-%d %H:%M:%S")

    pd.options.mode.chained_assignment = None

    try:
        from process import checkstack
    except ImportError:
        from corpkit.process import checkstack
        
    if checkstack('pythontex'):
        print_info=False

    def combiney(df, df2, operation='%', threshold='medium', prinf=True):
        """mash df and df2 together in appropriate way"""
        totals = False
        # delete under threshold
        if just_totals:
            if using_totals:
                if not single_totals:
                    to_drop = list(df2[df2['Combined total'] < threshold].index)
                    df = df.drop([e for e in to_drop if e in list(df.index)])
                    if prinf:
                        to_show = []
                        [to_show.append(w) for w in to_drop[:5]]
                        if len(to_drop) > 10:
                            to_show.append('...')
                            [to_show.append(w) for w in to_drop[-5:]]
                        if len(to_drop) > 0:
                            print('Removing %d entries below threshold:\n    %s' % (len(to_drop), '\n    '.join(to_show)))
                        if len(to_drop) > 10:
                            print('... and %d more ... \n' % (len(to_drop) - len(to_show) + 1))
                        else:
                            print('')
                else:
                    denom = df2
        else:
            denom = list(df2)
        if single_totals:
            if operation == '%':
                totals = df.sum() * 100.0 / float(df.sum().sum())
                df = df * 100.0
                try:
                    df = df.div(denom, axis=0)
                except ValueError:
                    
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)
            elif operation == '+':
                try:
                    df = df.add(denom, axis=0)
                except ValueError:
                    
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)
            elif operation == '-':
                try:
                    df = df.sub(denom, axis=0)
                except ValueError:
                    
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)
            elif operation == '*':
                totals = df.sum() * float(df.sum().sum())
                try:
                    df = df.mul(denom, axis=0)
                except ValueError:
                    
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)
            elif operation == '/':
                try:
                    totals = df.sum() / float(df.sum().sum())
                    df = df.div(denom, axis=0)
                except ValueError:
                    
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)

            elif operation == 'a':
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis=1) / df2
            
            elif operation.startswith('c'):
                import warnings
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    df = pandas.concat([df, df2], axis=1)
            return df, totals

        elif not single_totals:
            if not operation.startswith('a'):
                # generate totals
                if operation == '%':
                    totals = df.sum() * 100.0 / float(df2.sum().sum())
                if operation == '*':
                    totals = df.sum() * float(df2.sum().sum())
                if operation == '/':
                    totals = df.sum() / float(df2.sum().sum())
                if operation.startswith('c'):
                    # add here the info that merging will not work 
                    # with identical colnames
                    import warnings
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        d = pd.concat([df.T, df2.T])
                        # make index nums
                        d = d.reset_index()
                        # sum and remove duplicates
                        d = d.groupby('index').sum()
                        dx = d.reset_index('index')
                        dx.index = list(dx['index'])
                        df = dx.drop('index', axis=1).T

                def editf(datum):
                    meth = {'%': datum.div,
                            '*': datum.mul,
                            '/': datum.div,
                            '+': datum.add,
                            '-': datum.sub}

                    if datum.name in list(df2.columns):

                        method = meth[operation]
                        mathed = method(df2[datum.name], fill_value=0.0)
                        if operation == '%':
                            return mathed * 100.0
                        else:
                            return mathed
                    else:
                        return datum * 0.0

                df = df.apply(editf)

            else:
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis=1) / df2.T.sum()

        return df, totals

    def parse_input(df, the_input):
        """turn whatever has been passed in into list of words that can 
           be used as pandas indices---maybe a bad way to go about it"""
        parsed_input = False
        import re
        if the_input == 'all':
            the_input = r'.*'
        if isinstance(the_input, int):
            try:
                the_input = str(the_input)
            except:
                pass
            the_input = [the_input]
        elif isinstance(the_input, STRINGTYPE):
            regex = re.compile(the_input)
            parsed_input = [w for w in list(df) if re.search(regex, w)]
            return parsed_input
        from corpkit.dictionaries.process_types import Wordlist
        if isinstance(the_input, Wordlist) or the_input.__class__ == Wordlist:
            the_input = list(the_input)
        if isinstance(the_input, list):
            if isinstance(the_input[0], int):
                parsed_input = [word for index, word in enumerate(list(df)) if index in the_input]
            elif isinstance(the_input[0], STRINGTYPE):
                try:
                    parsed_input = [word for word in the_input if word in df.columns]
                except AttributeError: # if series
                    parsed_input = [word for word in the_input if word in df.index]
        return parsed_input

    def synonymise(df, pos='n'):
        """pass a df and a pos and convert df columns to most common synonyms"""
        from nltk.corpus import wordnet as wn
        #from dictionaries.taxonomies import taxonomies
        from collections import Counter
        fixed = []
        for w in list(df.columns):
            try:
                syns = []
                for syns in wn.synsets(w, pos=pos):
                    for w in syns:
                        synonyms.append(w)
                top_syn = Counter(syns).most_common(1)[0][0]
                fixed.append(top_syn)
            except:
                fixed.append(w)
        df.columns = fixed
        return df

    def convert_spell(df, convert_to='US', print_info=print_info):
        """turn dataframes into us/uk spelling"""
        from dictionaries.word_transforms import usa_convert
        if print_info:
            print('Converting spelling ... \n')
        if convert_to == 'UK':
            usa_convert = {v: k for k, v in list(usa_convert.items())}
        fixed = []
        for val in list(df.columns):
            try:
                fixed.append(usa_convert[val])
            except:
                fixed.append(val)
        df.columns = fixed
        return df

    def merge_duplicates(df, print_info=print_info):
        if print_info:
            print('Merging duplicate entries ... \n')
        # now we have to merge all duplicates
        for dup in df.columns.get_duplicates():
            #num_dupes = len(list(df[dup].columns))
            temp = df[dup].sum(axis=1)
            #df = df.drop([dup for d in range(num_dupes)], axis=1)
            df = df.drop(dup, axis=1)
            df[dup] = temp
        return df

    def name_replacer(df, replace_names, print_info=print_info):
        """replace entry names and merge"""
        import re
        # get input into list of tuples
        # if it's a string, we want to delete it
        if isinstance(replace_names, STRINGTYPE):
            replace_names = [(replace_names, '')]
        # this is for some malformed list
        if not isinstance(replace_names, dict):
            if isinstance(replace_names[0], STRINGTYPE):
                replace_names = [replace_names]
        # if dict, make into list of tupes
        if isinstance(replace_names, dict):
            replace_names = [(v, k) for k, v in replace_names.items()]
        for to_find, replacement in replace_names:
            if print_info:
                if replacement:
                    print('Replacing "%s" with "%s" ...\n' % (to_find, replacement))
                else:
                    print('Deleting "%s" from entry names ...\n' % to_find)
            to_find = re.compile(to_find)
            if not replacement:
                replacement = ''
            df.columns = [re.sub(to_find, replacement, l) for l in list(df.columns)]
        df = merge_duplicates(df, print_info=False)
        return df

    def just_these_entries(df, parsed_input, prinf=True):
        entries = [word for word in list(df) if word not in parsed_input]
        if prinf:
            print('Keeping %d entries:\n    %s' % \
                (len(parsed_input), '\n    '.join(parsed_input[:10])))
            if len(parsed_input) > 10:
                print('... and %d more ... \n' % (len(parsed_input) - 10))
            else:
                print('')
        df = df.drop(entries, axis=1)
        return df

    def skip_these_entries(df, parsed_input, prinf=True):
        if prinf:     
            print('Skipping %d entries:\n    %s' % \
                (len(parsed_input), '\n    '.join(parsed_input[:10])))
            if len(parsed_input) > 10:
                print('... and %d more ... \n' % (len(parsed_input) - 10))
            else:
                print('')
        df = df.drop(parsed_input, axis=1)
        return df

    def newname_getter(df, parsed_input, newname='combine', prinf=True, merging_subcorpora=False):
        """makes appropriate name for merged entries"""
        if merging_subcorpora:
            if newname is False:
                newname = 'combine'
        if isinstance(newname, int):
            the_newname = list(df.columns)[newname]
        elif isinstance(newname, STRINGTYPE):
            if newname == 'combine':
                if len(parsed_input) <= 3:
                    the_newname = '/'.join(parsed_input)
                elif len(parsed_input) > 3:
                    the_newname = '/'.join(parsed_input[:3]) + '...'
            else:
                the_newname = newname
        if not newname:
            # revise this code
            import operator
            sumdict = {}
            for item in parsed_input:
                summed = sum(list(df[item]))
                sumdict[item] = summed
            the_newname = max(iter(sumdict.items()), key=operator.itemgetter(1))[0]
        if not isinstance(the_newname, STRINGTYPE):
            the_newname = str(the_newname, errors='ignore')
        return the_newname

    def merge_these_entries(df, parsed_input, the_newname, prinf=True, merging='entries'):
        # make new entry with sum of parsed input
        if len(parsed_input) == 0:
            import warnings
            warnings.warn('No %s could be automatically merged.\n' % merging)
        else:
            if prinf:
                print('Merging %d %s as "%s":\n    %s' % \
                    (len(parsed_input), merging, the_newname, '\n    '.join(parsed_input[:10])))
                if len(parsed_input) > 10:
                    print('... and %d more ... \n' % (len(parsed_input) - 10))
                else:
                    print('')
        # remove old entries
        temp = sum([df[i] for i in parsed_input])

        if isinstance(df, Series):
            df = df.drop(parsed_input, errors='ignore')
            nms = list(df.index)
        else:
            df = df.drop(parsed_input, axis=1, errors='ignore')
            nms = list(df.columns)
        if the_newname in nms:
            df[the_newname] = df[the_newname] + temp
        else:
            df[the_newname] = temp
        return df

    def just_these_subcorpora(df, lst_of_subcorpora, prinf=True):        
        if isinstance(lst_of_subcorpora[0], int):
            lst_of_subcorpora = [str(l) for l in lst_of_subcorpora]
        good_years = [subcorpus for subcorpus in list(df.index) if subcorpus in lst_of_subcorpora]
        if prinf:
            print('Keeping %d subcorpora:\n    %s' % (len(good_years), '\n    '.join(good_years[:10])))
            if len(good_years) > 10:
                print('... and %d more ... \n' % (len(good_years) - 10))
            else:
                print('')
        df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus not in good_years], axis=0)
        return df

    def skip_these_subcorpora(df, lst_of_subcorpora, prinf=True):
        if isinstance(lst_of_subcorpora, int):
            lst_of_subcorpora = [lst_of_subcorpora]
        if isinstance(lst_of_subcorpora[0], int):
            lst_of_subcorpora = [str(l) for l in lst_of_subcorpora]
        bad_years = [subcorpus for subcorpus in list(df.index) if subcorpus in lst_of_subcorpora]
        if len(bad_years) == 0:
            import warnings
            warnings.warn('No subcorpora skipped.\n')
        else:
            if prinf:       
                print('Skipping %d subcorpora:\n    %s' % (len(bad_years), '\n    '.join([str(i) for i in bad_years[:10]])))
                if len(bad_years) > 10:
                    print('... and %d more ... \n' % (len(bad_years) - 10))
                else:
                    print('')
        df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus in bad_years], axis=0)
        return df

    def span_these_subcorpora(df, lst_of_subcorpora, prinf=True):
        """select only a span of suborpora (first, last)"""

        fir, sec = lst_of_subcorpora
        if len(lst_of_subcorpora) == 0:
            import warnings
            warnings.warn('Span not identified.\n')
        else:        
            if prinf:        
                print('Keeping subcorpora:\n    %d--%d\n' % (int(fir), int(sec)))
        sbs = list(df.index)
        df = df.ix[sbs.index(fir):sbs.index(sec) + 1]

        return df

    def projector(df, list_of_tuples, prinf=True):
        """project abs values"""
        if isinstance(list_of_tuples, list):
            tdict = {}
            for a, b in list_of_tuples:
                tdict[a] = b
            list_of_tuples = tdict
        for subcorpus, projection_value in list(list_of_tuples.items()):
            if isinstance(subcorpus, int):
                subcorpus = str(subcorpus)
            df.ix[subcorpus] = df.ix[subcorpus] * projection_value
            if prinf:
                if isinstance(projection_value, float):
                    print('Projection: %s * %s' % (subcorpus, projection_value))
                if isinstance(projection_value, int):
                    print('Projection: %s * %d' % (subcorpus, projection_value))
        if prinf:
            print('')
        return df

    def do_stats(df):
        """do linregress and add to df"""
        try: 
            from scipy.stats import linregress
        except ImportError:
            
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: sort type not available in this verion of corpkit.' % thetime)
            return False

        indices = list(df.index)
        first_year = list(df.index)[0]
        try:
            x = [int(y) - int(first_year) for y in indices]
        except ValueError:
            x = list(range(len(indices)))
        
        statfields = ['slope', 'intercept', 'r', 'p', 'stderr']

        stats = []
        if isinstance(df, Series):
            y = list(df.values)
            sl = Series(list(linregress(x, y)), index=statfields)

        else:    
            for entry in list(df.columns):
                y = list(df[entry])
                stats.append(list(linregress(x, y)))
            sl = DataFrame(zip(*stats), index=statfields, columns=list(df.columns))
        df = df.append(sl)
        
        # drop infinites and nans
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.fillna(0.0)
        return df

    def resort(df, sort_by = False, keep_stats = False):
        """sort results, potentially using scipy's linregress"""
        
        # translate options and make sure they are parseable
        stat_field = ['slope', 'intercept', 'r', 'p', 'stderr']
        easy_sorts = ['total', 'infreq', 'name', 'most', 'least']
        stat_sorts = ['increase', 'decrease', 'static', 'turbulent']
        options = stat_field + easy_sorts + stat_sorts
        sort_by_convert = {'most': 'total', True: 'total', 'least': 'infreq'}
        sort_by = sort_by_convert.get(sort_by, sort_by)

        # probably broken :(
        if just_totals:
            if sort_by == 'name':
                return df.sort_index()
            else:
                return df.sort_values(by='Combined total', ascending=sort_by != 'total', axis=1)

        stats_done = False
        if keep_stats or sort_by in stat_field + stat_sorts:
            df = do_stats(df)
            stats_done = True
            if isinstance(df, bool):
                if df is False:
                    return False
        
        if isinstance(df, Series):
            if stats_done:
                stats = df.ix[range(-5, 0)]
                df = df.drop(list(stats.index))
            if sort_by == 'name':
                df = df.sort_index()
            else:
                df = df.sort_values(ascending=sort_by != 'total')
            if stats_done:
                df = df.append(stats)
            return df

        if sort_by == 'name':
            # currently case sensitive
            df = df.reindex_axis(sorted(df.columns), axis=1)
        elif sort_by in ['total', 'infreq']:
            if df1_istotals:
                df = df.T
            df = df[list(df.sum().sort_values(ascending=sort_by != 'total').index)]
        
        # sort by slope etc., or search by subcorpus name
        if sort_by in stat_field or sort_by not in options:
            asc = kwargs.get('reverse', False)
            df = df.T.sort_values(by=sort_by, ascending=asc).T
        
        if sort_by in ['increase', 'decrease', 'static', 'turbulent']:
            slopes = df.ix['slope']
            if sort_by == 'increase':
                df = df[slopes.argsort()[::-1]]
            elif sort_by == 'decrease':
                df = df[slopes.argsort()]
            elif sort_by == 'static':
                df = df[slopes.abs().argsort()]
            elif sort_by == 'turbulent':
                df = df[slopes.abs().argsort()[::-1]]
            if remove_above_p:
                df = df.T
                df = df[df['p'] <= p]
                df = df.T

        # remove stats field by default
        if not keep_stats:
            df = df.drop(stat_field, axis=0, errors='ignore')
        return df

    def set_threshold(big_list, threshold, prinf=True):
        if isinstance(threshold, STRINGTYPE):
            if threshold.startswith('l'):
                denominator = 10000
            if threshold.startswith('m'):
                denominator = 5000
            if threshold.startswith('h'):
                denominator = 2500
            if isinstance(big_list, DataFrame):
                tot = big_list.sum().sum()

            if isinstance(big_list, Series):
                tot = big_list.sum()
            tshld = float(tot) / float(denominator)
        else:
            tshld = threshold
        if prinf:
            print('Threshold: %d\n' % tshld)
        return tshld

    # copy dataframe to be very safe
    df = dataframe1.copy()
    # make cols into strings
    try:
        df.columns = [str(c) for c in list(df.columns)]
    except:
        pass

    if operation is None:
        operation = 'None'

    if isinstance(interrogation, Concordance):
        return_conc = True
    # do concordance work
    if return_conc:
        if just_entries:
            if isinstance(just_entries, int):
                just_entries = [just_entries]
            if isinstance(just_entries, STRINGTYPE):
                df = df[df['m'].str.contains(just_entries)]
            if isinstance(just_entries, list):
                if all(isinstance(e, STRINGTYPE) for e in just_entries):
                    mp = df['m'].map(lambda x: x in just_entries)
                    df = df[mp]
                else:
                    df = df.ix[just_entries]

        if skip_entries:
            if isinstance(skip_entries, int):
                skip_entries = [skip_entries]
            if isinstance(skip_entries, STRINGTYPE):
                df = df[~df['m'].str.contains(skip_entries)]
            if isinstance(skip_entries, list):
                if all(isinstance(e, STRINGTYPE) for e in skip_entries):
                    mp = df['m'].map(lambda x: x not in skip_entries)
                    df = df[mp]
                else:
                    df = df.drop(skip_entries, axis=0)

        if just_subcorpora:
            if isinstance(just_subcorpora, int):
                just_subcorpora = [just_subcorpora]
            if isinstance(just_subcorpora, STRINGTYPE):
                df = df[df['c'].str.contains(just_subcorpora)]
            if isinstance(just_subcorpora, list):
                if all(isinstance(e, STRINGTYPE) for e in just_subcorpora):
                    mp = df['c'].map(lambda x: x in just_subcorpora)
                    df = df[mp]
                else:
                    df = df.ix[just_subcorpora]

        if skip_subcorpora:
            if isinstance(skip_subcorpora, int):
                skip_subcorpora = [skip_subcorpora]
            if isinstance(skip_subcorpora, STRINGTYPE):
                df = df[~df['c'].str.contains(skip_subcorpora)]
            if isinstance(skip_subcorpora, list):
                if all(isinstance(e, STRINGTYPE) for e in skip_subcorpora):
                    mp = df['c'].map(lambda x: x not in skip_subcorpora)
                    df = df[mp]
                else:
                    df = df.drop(skip_subcorpora, axis=0)

        return Concordance(df)

    if print_info:
        print('\n***Processing results***\n========================\n')

    df1_istotals = False
    if isinstance(df, Series):
        df1_istotals = True
        df = DataFrame(df)
        # if just a single result
    else:
        df = DataFrame(df)
    if operation.startswith('k'):
        if sort_by is False:
            if not df1_istotals:
                sort_by = 'turbulent'
        if df1_istotals:
            df = df.T
    
    # figure out if there's a second list
    # copy and remove totals if there is
    single_totals = True
    using_totals = False
    outputmode = False

    if denominator.__class__ == Interrogation:
        try:
            denominator = denominator.results
        except AttributeError:
            denominator = denominator.totals

    if denominator is not False and not isinstance(denominator, STRINGTYPE):
        df2 = denominator.copy()
        using_totals = True
        if isinstance(df2, DataFrame):
            if len(df2.columns) > 1:
                single_totals = False
            else:
                df2 = Series(df2)
        elif isinstance(df2, Series):
            single_totals = True
            #if operation == 'k':
                #raise ValueError('Keywording requires a DataFrame for denominator. Use "self"?')
    else:
        if operation in ['k', 'a', '%', '/', '*', '-', '+']:
            denominator = 'self'         
        if denominator == 'self':
            outputmode = True

    if operation.startswith('a') or operation.startswith('A'):
        if list(df.columns)[0] != '0' and list(df.columns)[0] != 0:
            df = df.T
        if using_totals:
            if not single_totals:
                df2 = df2.T

    if projection:
        # projection shouldn't do anything when working with '%', remember.
        df = projector(df, projection)
        if using_totals:
            df2 = projector(df2, projection)

    if spelling:
        df = convert_spell(df, convert_to=spelling)
        df = merge_duplicates(df, print_info=False)

        if not single_totals:
            df2 = convert_spell(df2, convert_to=spelling, print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
        if not df1_istotals:
            sort_by = 'total'

    if replace_names:
        df = name_replacer(df, replace_names)
        df = merge_duplicates(df)
        if not single_totals:
            df2 = name_replacer(df2, replace_names, print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
        if not sort_by:
            sort_by = 'total'

    if replace_subcorpus_names:
        df = name_replacer(df.T, replace_subcorpus_names)
        df = merge_duplicates(df).T
        df = df.sort_index()
        if not single_totals:
            if isinstance(df2, DataFrame):
                df2 = df2.T
            df2 = name_replacer(df2, replace_subcorpus_names, print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
            if isinstance(df2, DataFrame):
                df2 = df2.T
            df2 = df2.sort_index()
        if not sort_by:
            sort_by = 'total'

    # remove old stats if they're there:
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        df = df.drop(statfields, axis=0)
    except:
        pass
    if using_totals:
        try:
            df2 = df2.drop(statfields, axis=0)
        except:
            pass

    # remove totals and tkinter order
    for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
        if name == 'Total' and df1_istotals:
            continue
        try:
            df = df.drop(name, axis=ax, errors='ignore')
        except:
            pass
    for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
        if name == 'Total' and single_totals:
            continue

        try:

            df2 = df2.drop(name, axis=ax, errors='ignore')
        except:
            pass

    # merging: make dicts if they aren't already, so we can iterate
    if merge_entries:
        if not isinstance(merge_entries, list):
            if isinstance(merge_entries, STRINGTYPE):
                merge_entries = {'combine': merge_entries}
            # for newname, criteria    
            for name, the_input in sorted(merge_entries.items()):
                pin = parse_input(df, the_input)
                the_newname = newname_getter(df, pin, newname=name, prinf=print_info)
                df = merge_these_entries(df, pin, the_newname, prinf=print_info)
                if not single_totals:
                    pin2 = parse_input(df2, the_input)
                    df2 = merge_these_entries(df2, pin2, the_newname, prinf=False)
        else:
            for i in merge_entries:
                pin = parse_input(df, merge_entries)
                the_newname = newname_getter(df, pin, prinf=print_info)
                df = merge_these_entries(df, pin, the_newname, prinf=print_info)
                if not single_totals:
                    pin2 = parse_input(df2, merge_entries)
                    df2 = merge_these_entries(df2, pin2, the_newname, prinf=False)
    
    if merge_subcorpora:
        if not isinstance(merge_subcorpora, dict):
            if isinstance(merge_subcorpora, list):
                if isinstance(merge_subcorpora[0], tuple):
                    merge_subcorpora = {x: y for x, y in merge_subcorpora}
                elif isinstance(merge_subcorpora[0], STRINGTYPE):
                    merge_subcorpora = {'combine': [x for x in merge_subcorpora]}
                elif isinstance(merge_subcorpora[0], int):
                    merge_subcorpora = {'combine': [str(x) for x in merge_subcorpora]}
            else:
                merge_subcorpora = {'combine': merge_subcorpora}
        for name, the_input in sorted(merge_subcorpora.items()):
            pin = parse_input(df.T, the_input)
            the_newname = newname_getter(df.T, pin, newname=name, \
                merging_subcorpora=True, prinf=print_info)
            df = merge_these_entries(df.T, pin, the_newname, merging='subcorpora', 
                                     prinf=print_info).T
            if using_totals:
                pin2 = parse_input(df2.T, the_input)
                df2 = merge_these_entries(df2.T, pin2, the_newname, merging='subcorpora', 
                                          prinf=False).T

    if just_subcorpora:
        df = just_these_subcorpora(df, just_subcorpora, prinf=print_info)
        if using_totals:
            df2 = just_these_subcorpora(df2, just_subcorpora, prinf=False)
    
    if skip_subcorpora:
        df = skip_these_subcorpora(df, skip_subcorpora, prinf=print_info)
        if using_totals:
            df2 = skip_these_subcorpora(df2, skip_subcorpora, prinf=False)
    
    if span_subcorpora:
        df = span_these_subcorpora(df, span_subcorpora, prinf=print_info)
        if using_totals:
            df2 = span_these_subcorpora(df2, span_subcorpora, prinf=False)

    if just_entries:
        df = just_these_entries(df, parse_input(df, just_entries), prinf=print_info)
        if not single_totals:
            df2 = just_these_entries(df2, parse_input(df2, just_entries), prinf=False)
    
    if skip_entries:
        df = skip_these_entries(df, parse_input(df, skip_entries), prinf=print_info)
        if not single_totals:
            df2 = skip_these_entries(df2, parse_input(df2, skip_entries), prinf=False)

    # drop infinites and nans
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0.0)

    if just_totals:
        df = DataFrame(df.sum(), columns=['Combined total'])
        if using_totals:
            if not single_totals:
                df2 = DataFrame(df2.sum(), columns=['Combined total'])
            else:
                df2 = df2.sum()

    tots = df.sum(axis=1)

    if using_totals or outputmode:
        if not operation.startswith('k'):
            tshld = 0
            # set a threshold if just_totals
            if outputmode is True:
                df2 = df.T.sum()
                if not just_totals:
                    df2.name = 'Total'
                else:
                    df2.name = 'Combined total'
                using_totals = True
                single_totals = True
            if just_totals:
                if not single_totals:
                    tshld = set_threshold(df2, threshold, prinf=print_info)
            df, tots = combiney(df, df2, operation=operation, threshold=tshld, prinf=print_info)
    
    # if doing keywording...
    if operation.startswith('k'):

        if isinstance(denominator, STRINGTYPE):
            if denominator == 'self':
                df2 = df.copy()
            else:
                df2 = denominator

        from corpkit.keys import keywords
        df = keywords(df, df2, 
                      selfdrop=selfdrop, 
                      threshold=threshold, 
                      print_info=print_info,
                      editing=True,
                      calc_all=calc_all,
                      sort_by=sort_by,
                      measure=keyword_measure,
                      **kwargs)
    
    # drop infinites and nans
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0.0)

    # resort data
    if sort_by or keep_stats:
        df = resort(df, keep_stats=keep_stats, sort_by=sort_by)
        if isinstance(df, bool):
            if df is False:
                return 'linregress'

    if keep_top:
        if not just_totals:
            df = df[list(df.columns)[:keep_top]]
        else:
            df = df.head(keep_top)

    if just_totals:
        # turn just_totals into series:
        df = Series(df['Combined total'], name='Combined total')

    if df1_istotals:
        if operation.startswith('k'):
            try:
                df = Series(df.ix[dataframe1.name])
                df.name = '%s: keyness' % df.name
            except:
                df = df.iloc[0, :]
                df.name = 'keyness' % df.name

    # generate totals branch if not percentage results:
    # fix me
    if df1_istotals or operation.startswith('k'):
        if not just_totals:
            try:
                total = Series(df['Total'], name='Total')
            except:
                total = 'none'
                pass

            #total = df.copy()
        else:
            total = 'none'
    else:
        # might be wrong if using division or something...
        try:
            total = df.T.sum(axis=1)
        except:
            total = 'none'
    
    if not isinstance(tots, DataFrame) and not isinstance(tots, Series):
        total = df.sum(axis=1)
    else:
        total = tots

    if isinstance(df, DataFrame):
        datatype = df.iloc[0].dtype
    else:
        datatype = df.dtype
    locs['datatype'] = datatype

    # TURN INT COL NAMES INTO STR
    try:
        df.results.columns = [str(d) for d in list(df.results.columns)]
    except:
        pass

    def add_tkt_index(df):
        """add an order for tkintertable if using gui"""
        if isinstance(df, Series):
            df = df.T
            df = df.drop('tkintertable-order', errors='ignore', axis=0)
            df = df.drop('tkintertable-order', errors='ignore', axis=1)
            dat = [i for i in range(len(df.index))]
            df['tkintertable-order'] = Series(dat, index=list(df.index))
            df = df.T
        return df

    # while tkintertable can't sort rows
    if checkstack('tkinter'):
        df = add_tkt_index(df)

    if kwargs.get('df1_always_df'):
        if isinstance(df, Series):
            df = DataFrame(df)

    # delete non-appearing conc lines
    if not hasattr(interrogation, 'concordance'):
        lns = None
    elif hasattr(interrogation, 'concordance') and interrogation.concordance is None:
        lns = None
    else:
        col_crit = interrogation.concordance['m'].map(lambda x: x in list(df.columns))
        ind_crit = interrogation.concordance['c'].map(lambda x: x in list(df.index))
        lns = interrogation.concordance[col_crit]
        lns = lns.loc[ind_crit]
        lns = Concordance(lns)
    
    output = Interrogation(results=df, totals=total, query=locs, concordance=lns)

    if print_info:
        print('***Done!***\n========================\n')

    return output
예제 #4
0
파일: editor.py 프로젝트: agogear/corpkit
def editor(dataframe1, 
            operation = None,
            dataframe2 = False,
            sort_by = False,
            keep_stats = False,
            keep_top = False,
            just_totals = False,
            threshold = 'medium',
            just_entries = False,
            skip_entries = False,
            merge_entries = False,
            newname = 'combine',
            multiple_merge = False,
            just_subcorpora = False,
            skip_subcorpora = False,
            span_subcorpora = False,
            merge_subcorpora = False,
            new_subcorpus_name = False,
            replace_names = False,
            projection = False,
            remove_above_p = False,
            p = 0.05, 
            revert_year = True,
            print_info = True,
            spelling = False,
            selfdrop = True,
            calc_all = True,
            **kwargs
            ):
    """
    Edit results of interrogations, calculate slopes, do keywording, sort, etc.

    dataframe1:         list of results or totals to edit
    operation:          kind of maths to do on inputted lists:
                        '+', '-', '/', '*', '%': self explanatory
                        'k': log likelihood (keywords)
                        'a': get distance metric (for use with interrogator 'a' option)
                        'd': get percent difference (alternative approach to keywording)
    dataframe2:         list of results or totals
                        if list of results, for each entry in dataframe 1, locate
                        entry with same name in dataframe 2, and do maths there
    sort_by:            calculate slope, stderr, r, p values, then sort by:
                        increase: highest to lowest slope value
                        decrease: lowest to highest slope value
                        turbulent: most change in y axis values
                        static: least change in y axis values
                        total/most: largest number first
                        infreq/least: smallest number first
                        name: alphabetically
    keep_stats:         keep/drop stats values from dataframe after sorting
    keep_top:           after sorting, remove all but the top n results
    just_totals:        sum each column and work with sums
    threshold:          when using results list as dataframe 2, drop values occurring
                        fewer than n times. If not keywording, you can use:
                        'high': dataframe2 total / 2500
                        'medium': dataframe2 total / 5000
                        'low': dataframe2 total / 10000
                        If keywording, there are smaller default thresholds
    just_entries:       keep only entries specified using:
                        regex, list of indices, list of words
    skip_entries:       as above
    merge_entries:      as above, or can take a dictionary of:
                        {newname: regex/wordlist/indexlist}
    newname:            new name for merged entries
    just_subcorpora:    as above
    skip_subcorpora:    as above
    span_subcorpora:    If numerical subcorpus names, give a tuple and get all
                        numbers in between
    merge_subcorpora:   as above
    new_subcorpus_name: as above
    replace_names:      edit result names and then merge duplicate names. Use either:
                        a tuple: (r'regex-to-match', 'replacement text')
                        a string: a regex to delete
    projection:         a tuple of ('subcorpus-name', n) to multiply results by n
    remove_above_p:     delete any result over p
    p:                  set the p value
    revert_year:        when doing linear regression on years, turn them into 1, 2 ...
    print_info:         print stuff to console showing what's being edited
    spelling:           convert/normalise spelling:
                        'US' or 'UK'
    selfdrop:           when keywording, try to remove target corpus from reference corpus
    calc_all:           when keywording, calculate words that appear in either corpus
    """

    import pandas
    import pandas as pd
    import numpy as np
    import re
    import collections
    from pandas import DataFrame, Series
    from time import localtime, strftime
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        have_ipython = False
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass

    pd.options.mode.chained_assignment = None

    the_time_started = strftime("%Y-%m-%d %H:%M:%S")
    
    pd.set_option('display.float_format', lambda x: '%.2f' % x)

    from corpkit.tests import check_pytex
    if check_pytex():
        print_info = False

    import signal
    def signal_handler(signal, frame):
        import signal
        """pause on ctrl+c, rather than just stop loop"""

        #def subsig(signal, frame):
        #    """exit on ctrl c"""
        #    import sys
        #    sys.exit(0)
        #
        #signal.signal(signal.SIGINT, subsig)
        
        import sys
        from time import localtime, strftime
        time = strftime("%H:%M:%S", localtime())
        sel = raw_input('\n\n%s: Paused. Press return to resume, or type exit to quit: \n' % time)
        if sel.startswith('e') or sel.startswith('E'):
            sys.exit(0)
        else:
            time = strftime("%H:%M:%S", localtime())
            print '%s: Interrogation resumed.\n' % time
    signal.signal(signal.SIGINT, signal_handler)

    def combiney(df, df2, operation = '%', threshold = 'medium', prinf = True):
        """mash df and df2 together in appropriate way"""
        totals = False
        # delete under threshold
        if just_totals:
            if using_totals:
                if not single_totals:
                    to_drop = list(df2[df2['Combined total'] < threshold].index)
                    df = df.drop([e for e in to_drop if e in list(df.index)])
                    if prinf:
                        to_show = []
                        [to_show.append(w) for w in to_drop[:5]]
                        if len(to_drop) > 10:
                            to_show.append('...')
                            [to_show.append(w) for w in to_drop[-5:]]
                        if len(to_drop) > 0:
                            print 'Removing %d entries below threshold:\n    %s' % (len(to_drop), '\n    '.join(to_show))
                        if len(to_drop) > 10:
                            print '... and %d more ... \n' % (len(to_drop) - len(to_show) + 1)
                        else:
                            print ''
                else:
                    denom = df2
        else:
            denom = list(df2)
        if single_totals:
            if operation == '%':
                totals = df.sum() * 100.0 / float(df.sum().sum())
                df = df * 100.0
                df = df.div(denom, axis = 0)
                
            elif operation == '+':
                df = df.add(denom, axis = 0)
            elif operation == '-':
                df = df.sub(denom, axis = 0)
            elif operation == '*':
                totals = df.sum() * float(df.sum().sum())
                df = df.mul(denom, axis = 0)
            elif operation == '/':
                totals = df.sum() / float(df.sum().sum())
                df = df.div(denom, axis = 0)
            elif operation == 'd':
                #df.ix['Combined total'] = df.sum()
                #to_drop = to_drop = list(df.T[df.T['Combined total'] < threshold].index)
                to_drop = [n for n in list(df.columns) if df[n].sum() < threshold]
                df = df.drop([e for e in to_drop if e in list(df.columns)], axis = 1)
                #df.drop('Combined total')
                if prinf:
                    to_show = []
                    [to_show.append(w) for w in to_drop[:5]]
                    if len(to_drop) > 10:
                        to_show.append('...')
                        [to_show.append(w) for w in to_drop[-5:]]
                    if len(to_drop) > 0:
                        print 'Removing %d entries below threshold:\n    %s' % (len(to_drop), '\n    '.join(to_show))
                    if len(to_drop) > 10:
                        print '... and %d more ... \n' % (len(to_drop) - len(to_show) + 1)
                    else:
                        print ''

                # get normalised num in target corpus
                norm_in_target = df.div(denom, axis = 0)
                # get normalised num in reference corpus, with or without selfdrop
                tot_in_ref = df.copy()
                for c in list(tot_in_ref.index):
                    if selfdrop:
                        tot_in_ref.ix[c] = df.sum() - tot_in_ref.ix[c]
                    else:
                        tot_in_ref.ix[c] = df.sum()
                norm_in_ref = tot_in_ref.div(df.sum().sum())
                df = (norm_in_target - norm_in_ref) / norm_in_ref * 100.0
                df = df.replace(float(-100.00), np.nan)

            elif operation == 'a':
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis = 1) / df2
            return df, totals

        elif not single_totals:
            if not operation.startswith('a'):
                # generate totals
                if operation == '%':
                    totals = df.sum() * 100.0 / float(df2.sum().sum())
                if operation == '*':
                    totals = df.sum() * float(df2.sum().sum())
                if operation == '/':
                    totals = df.sum() / float(df2.sum().sum())

                for index, entry in enumerate(list(df.columns)):
                    #p.animate(index)
                    if operation == '%':
                        try:
                            df[entry] = df[entry] * 100.0 / df2[entry]
                        except:
                            continue
                        #df.drop(entry, axis = 1, inplace = True)
                        #df[entry] = maths_done
                    elif operation == '+':
                        try:
                            df[entry] = df[entry] + df2[entry]
                        except:
                            continue
                    elif operation == '-':
                        try:
                            df[entry] = df[entry] - df2[entry]
                        except:
                            continue
                    elif operation == '*':
                        try:
                            df[entry] = df[entry] * df2[entry]
                        except:
                            continue
                    elif operation == '/':
                        try:
                            df[entry] = df[entry] / df2[entry]
                        except:
                            continue
            else:
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis = 1) / df2.T.sum()
            #p.animate(len(list(df.columns)))
            #if have_ipython:
                #clear_output()
        return df, totals

    def parse_input(df, input):
        """turn whatever has been passed in into list of words that can 
           be used as pandas indices---maybe a bad way to go about it"""
        import re
        if input == 'all':
            input = r'.*'
        if type(input) == int:
            try:
                input = str(input)
            except:
                pass
            input = [input]
        elif type(input) == str:
            try:
                regex = re.compile(input)
                parsed_input = [w for w in list(df) if re.search(regex, w)]
                return parsed_input
            except:
                input = [input]
        if type(input) == list:
            if type(input[0]) == int:
                parsed_input = [word for index, word in enumerate(list(df)) if index in input]
            elif type(input[0]) == str or type(input[0]) == unicode:
                try:
                    parsed_input = [word for word in input if word in df.columns]
                except AttributeError: # if series
                    parsed_input = [word for word in input if word in df.index]
        return parsed_input


    def synonymise(df, pos = 'n'):
        """pass a df and a pos and convert df columns to most common synonyms"""
        import nltk
        from nltk.corpus import wordnet as wn
        #from dictionaries.taxonomies import taxonomies
        from collections import Counter
        fixed = []
        for w in list(df.columns):
            try:
                syns = []
                for syns in wn.synsets(w, pos = pos):
                    for w in syns:
                        synonyms.append(w)
                top_syn = Counter(syns).most_common(1)[0][0]
                fixed.append(top_syn)
            except:
                fixed.append(w)
        df.columns = fixed
        return df

    def convert_spell(df, convert_to = 'US', print_info = print_info):
        """turn dataframes into us/uk spelling"""
        from dictionaries.word_transforms import usa_convert
        if print_info:
            print 'Converting spelling ... \n'
        if convert_to == 'UK':
            usa_convert = {v: k for k, v in usa_convert.items()}
        fixed = []
        for val in list(df.columns):
            try:
                fixed.append(usa_convert[val])
            except:
                fixed.append(val)
        df.columns = fixed
        return df

    def merge_duplicates(df, print_info = print_info):
        if print_info:
            print 'Merging duplicate entries ... \n'
        # now we have to merge all duplicates
        for dup in df.columns.get_duplicates():
            #num_dupes = len(list(df[dup].columns))
            temp = df[dup].sum(axis = 1)
            #df = df.drop([dup for d in range(num_dupes)], axis = 1)
            df = df.drop(dup, axis = 1)
            df[dup] = temp
        return df

    def name_replacer(df, replace_names, print_info = print_info):
        """replace entry names and merge"""
        import re        
        # double or single nest if need be
        if type(replace_names) == str:
            replace_names = [(replace_names, '')]
        if type(replace_names) != dict:
            if type(replace_names[0]) == str:
                replace_names = [replace_names]
        if type(replace_names) == dict:
            replace_names = [(v, k) for k, v in replace_names.items()]
        for to_find, replacement in replace_names:
            if print_info:
                try:
                    print 'Replacing "%s" with "%s" ...\n' % (to_find, replacement)
                except:
                    print 'Deleting "%s" from entry names ...\n' % (to_find)
            to_find = re.compile(to_find)
            try:
                replacement = replacement
            except:
                replacement = ''
            df.columns = [re.sub(to_find, replacement, l) for l in list(df.columns)]
        df = merge_duplicates(df, print_info = False)
        return df

    def just_these_entries(df, parsed_input, prinf = True):
        entries = [word for word in list(df) if word not in parsed_input]
        if prinf:
            print 'Keeping %d entries:\n    %s' % (len(parsed_input), '\n    '.join(parsed_input[:10]))
            if len(parsed_input) > 10:
                print '... and %d more ... \n' % (len(parsed_input) - 10)
            else:
                print ''
        df = df.drop(entries, axis = 1)
        return df

    def skip_these_entries(df, parsed_input, prinf = True):
        if prinf:     
            print 'Skipping %d entries:\n    %s' % (len(parsed_input), '\n    '.join(parsed_input[:10]))
            if len(parsed_input) > 10:
                print '... and %d more ... \n' % (len(parsed_input) - 10)
            else:
                print ''
        df = df.drop(parsed_input, axis = 1)
        return df

    def newname_getter(df, parsed_input, newname = 'combine', prinf = True, merging_subcorpora = False):
        """makes appropriate name for merged entries"""
        if merging_subcorpora:
            if newname is False:
                newname = 'combine'
        if type(newname) == int:
            the_newname = list(df.columns)[newname]
        elif type(newname) == str:
            if newname == 'combine':
                if len(parsed_input) <= 3:
                    the_newname = '/'.join(parsed_input)
                elif len(parsed_input) > 3:
                    the_newname = '/'.join(parsed_input[:3]) + '...'
            else:
                the_newname = newname
        if newname is False:
            # revise this code
            import operator
            sumdict = {}
            for item in parsed_input:
                summed = sum(list(df[item]))
                sumdict[item] = summed
            the_newname = max(sumdict.iteritems(), key=operator.itemgetter(1))[0]
        if type(the_newname) != unicode:
            the_newname = unicode(the_newname, errors = 'ignore')
        return the_newname

    def merge_these_entries(df, parsed_input, the_newname, prinf = True, merging = 'entries'):
        # make new entry with sum of parsed input
        if len(parsed_input) == 0:
            import warnings
            warnings.warn('No %s could be automatically merged.\n' % merging)
        else:
            if prinf:
                print 'Merging %d %s as "%s":\n    %s' % (len(parsed_input), merging, the_newname, '\n    '.join(parsed_input[:10]))
                if len(parsed_input) > 10:
                    print '... and %d more ... \n' % (len(parsed_input) - 10)
                else:
                    print ''
        # remove old entries
        temp = sum([df[i] for i in parsed_input])
        if not multiple_merge:
            if type(df) == pandas.core.series.Series:
                df = df.drop(parsed_input)
            else:
                df = df.drop(parsed_input, axis = 1)
        df[the_newname] = temp
        return df

    def just_these_subcorpora(df, lst_of_subcorpora, prinf = True):        
        if type(lst_of_subcorpora[0]) == int:
            lst_of_subcorpora = [str(l) for l in lst_of_subcorpora]
        good_years = [subcorpus for subcorpus in list(df.index) if subcorpus in lst_of_subcorpora]
        if prinf:
            print 'Keeping %d subcorpora:\n    %s' % (len(good_years), '\n    '.join(good_years[:10]))
            if len(good_years) > 10:
                print '... and %d more ... \n' % (len(good_years) - 10)
            else:
                print ''
        df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus not in good_years], axis = 0)
        return df

    def skip_these_subcorpora(df, lst_of_subcorpora, prinf = True):
        if type(lst_of_subcorpora) == int:
            lst_of_subcorpora = [lst_of_subcorpora]
        if type(lst_of_subcorpora[0]) == int:
            lst_of_subcorpora = [str(l) for l in lst_of_subcorpora]
        bad_years = [subcorpus for subcorpus in list(df.index) if subcorpus in lst_of_subcorpora]
        if len(bad_years) == 0:
            import warnings
            warnings.warn('No subcorpora skipped.\n')
        else:
            if prinf:       
                print 'Skipping %d subcorpora:\n    %s' % (len(bad_years), '\n    '.join([str(i) for i in bad_years[:10]]))
                if len(bad_years) > 10:
                    print '... and %d more ... \n' % (len(bad_years) - 10)
                else:
                    print ''
        df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus in bad_years], axis = 0)
        return df

    def span_these_subcorpora(df, lst_of_subcorpora, prinf = True):
        non_totals = [subcorpus for subcorpus in list(df.index)]
        good_years = [subcorpus for subcorpus in non_totals if int(subcorpus) >= int(lst_of_subcorpora[0]) and int(subcorpus) <= int(lst_of_subcorpora[-1])]
        if len(lst_of_subcorpora) == 0:
            import warnings
            warnings.warn('Span not identified.\n')
        else:        
            if prinf:        
                print 'Keeping subcorpora:\n    %d--%d\n' % (int(lst_of_subcorpora[0]), int(lst_of_subcorpora[-1]))
        df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus not in good_years], axis = 0)
        # retotal needed here
        return df

    def projector(df, list_of_tuples, prinf = True):
        for subcorpus, projection_value in list_of_tuples:
            if type(subcorpus) == int:
                subcorpus = str(subcorpus)
            df.ix[subcorpus] = df.ix[subcorpus] * projection_value
            if prinf:
                if type(projection_value) == float:
                    print 'Projection: %s * %s' % (subcorpus, projection_value)
                if type(projection_value) == int:
                    print 'Projection: %s * %d' % (subcorpus, projection_value)
        if prinf:
            print ''
        return df

    def do_stats(df):
        """do linregress and add to df"""
        from scipy.stats import linregress
        entries = []
        slopes = []
        intercepts = []
        rs = []
        ps = []
        stderrs = []
        indices = list(df.index)
        first_year = list(df.index)[0]
        try:
            x = [int(y) - int(first_year) for y in indices]
        except ValueError:
            x = range(len(indices))
        statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
        for entry in list(df.columns):
            entries.append(entry)
            y = list(df[entry])
            slope, intercept, r, p, stderr = linregress(x, y)
            slopes.append(slope)
            intercepts.append(intercept)
            rs.append(r)
            ps.append(p)
            stderrs.append(stderr)
        sl = pd.DataFrame([slopes, intercepts, rs, ps, stderrs], 
                           index = statfields, 
                           columns = list(df.columns))
        df = df.append(sl)
        # drop infinites and nans
        if operation != 'd':
            df = df.replace([np.inf, -np.inf], np.nan)
            df = df.fillna(0.0)
        return df

    def recalc(df, operation = '%'):
        statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
        """Add totals to the dataframe1"""

        #df.drop('Total', axis = 0, inplace = True)
        #df.drop('Total', axis = 1, inplace = True)
        try:
            df['temp-Total'] = df.drop(statfields).sum(axis = 1)
        except:
            df['temp-Total'] = df.sum(axis = 1)
        df = df.T
        try:
            df['temp-Total'] = df.drop(statfields).sum(axis = 1)
        except:
            df['temp-Total'] = df.sum(axis = 1)
        df = df.T
        return df

    def resort(df, sort_by = False, keep_stats = False):
        """sort results"""
        # translate options and make sure they are parseable
        

        options = ['total', 'name', 'infreq', 'increase', 'turbulent',
                   'decrease', 'static', 'most', 'least', 'none']

        if sort_by is True:
            sort_by = 'total'
        if sort_by == 'most':
            sort_by = 'total'
        if sort_by == 'least':
            sort_by = 'infreq'
        if sort_by not in options:
            raise ValueError("sort_by parameter error: '%s' not recognised. Must be True, False, %s" % (sort_by, ', '.join(options)))

        if operation.startswith('k'):
            if type(df) == pandas.core.series.Series:
                if sort_by == 'total':
                    df = df.order(ascending = False)

                elif sort_by == 'infreq':
                    df = df.order(ascending = True)

                elif sort_by == 'name':
                    df = df.sort_index()
                return df

        if just_totals:
            if sort_by == 'infreq':
                df = df.sort(columns = 'Combined total', ascending = True)
            elif sort_by == 'total':
                df = df.sort(columns = 'Combined total', ascending = False)
            elif sort_by == 'name':
                df = df.sort_index()
            return df

        # this is really shitty now that i know how to sort, like in the above
        if keep_stats:
            df = do_stats(df)
        if sort_by == 'total':
            if df1_istotals:
                df = df.T
            df = recalc(df, operation = operation)
            tot = df.ix['temp-Total']
            df = df[tot.argsort()[::-1]]
            df = df.drop('temp-Total', axis = 0)
            df = df.drop('temp-Total', axis = 1)
            if df1_istotals:
                df = df.T
        elif sort_by == 'infreq':
            if df1_istotals:
                df = df.T
            df = recalc(df, operation = operation)
            tot = df.ix['temp-Total']
            df = df[tot.argsort()]
            df = df.drop('temp-Total', axis = 0)
            df = df.drop('temp-Total', axis = 1)
            if df1_istotals:
                df = df.T
        elif sort_by == 'name':
            # currently case sensitive...
            df = df.reindex_axis(sorted(df.columns), axis=1)
        else:
            statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
            
            if not keep_stats:
                df = do_stats(df)

            slopes = df.ix['slope']
            if sort_by == 'increase':
                df = df[slopes.argsort()[::-1]]
            elif sort_by == 'decrease':
                df = df[slopes.argsort()]
            elif sort_by == 'static':
                df = df[slopes.abs().argsort()]
            elif sort_by == 'turbulent':
                df = df[slopes.abs().argsort()[::-1]]
            if remove_above_p:
                # the easy way to do it!
                df = df.T
                df = df[df['p'] <= p]
                df = df.T

            # remove stats field by default
            if not keep_stats:
                df = df.drop(statfields, axis = 0)

        return df

    def set_threshold(big_list, threshold, prinf = True, for_keywords = False):
        if type(threshold) == str:
            if threshold.startswith('l'):
                denominator = 10000
            if threshold.startswith('m'):
                denominator = 5000
            if threshold.startswith('h'):
                denominator = 2500

            if type(big_list) == pandas.core.frame.DataFrame:
                tot = big_list.sum().sum()

            if type(big_list) == pandas.core.series.Series:
                tot = big_list.sum()
            the_threshold = float(tot) / float(denominator)
            #if for_keywords:
                #the_threshold = the_threshold / 2
        else:
            the_threshold = threshold
        if prinf:
            print 'Threshold: %d\n' % the_threshold
        return the_threshold

#####################################################

    # check if we're in concordance mode
    try:
        if list(dataframe1.columns) == ['l', 'm', 'r']:
            conc_lines = True
        else:
            conc_lines = False
    except:
        conc_lines = False

    # copy dataframe to be very safe
    try:
        df = dataframe1.copy()
    except AttributeError:
        no_good_dataframe1 = True
        while no_good_dataframe1:
            if 'interrogation' in str(type(dataframe1)):
                sel = raw_input("\nIt looks like you're trying to edit an interrogation, " \
                                  "rather than an interrogation's .results or .totals branch. You can:\n\n    a) select .results branch\n    b) select .totals branch\n    c) exit\n\nYour choice: ")
                if sel.startswith('a'):
                    try:
                        dataframe1 = dataframe1.results
                        no_good_dataframe1 = False
                    except:
                        pass
                elif sel.startswith('b'):
                    try:
                        dataframe1 = dataframe1.totals
                        no_good_dataframe1 = False
                    except:
                        pass
                else:
                    return
            else:
                raise ValueError("Thing to be edited (dataframe1) needs to be a Pandas DataFrame or Series. " \
                                  "Right now, its type is: '%s'." % type(dataframe1).__name__)

        df = dataframe1.copy()

    # make cols into strings
    try:
        df.columns = [str(c) for c in list(df.columns)]
    except:
        pass

    if operation is None:
        operation = 'None'

    # do concordance work
    if conc_lines:
        df = dataframe1.copy()

        if just_entries:
            if type(just_entries) == int:
                just_entries = [just_entries]
            if type(just_entries) == str:
                df = df[df['m'].str.contains(just_entries)]
            if type(just_entries) == list:
                if type(just_entries[0]) == str:
                    regex = re.compile(r'(?i)^(' + r'|'.join(just_entries) + r')$')
                    df = df[df['m'].str.contains(regex)]
                else:
                    df = df.ix[just_entries].reset_index(drop = True)

        if skip_entries:
            if type(skip_entries) == int:
                skip_entries = [skip_entries]
            if type(skip_entries) == str:
                df = df[~df['m'].str.contains(skip_entries)]
            if type(skip_entries) == list:
                if type(skip_entries[0]) == str:
                    regex = re.compile(r'(?i)^(' + r'|'.join(skip_entries) + r')$')
                    df = df[~df['m'].str.contains(regex)]
                else:
                    df = df.ix[[e for e in list(df.index) if e not in skip_entries]].reset_index(drop = True)

        return df

    if print_info:
        print '\n***Processing results***\n========================\n'

    df1_istotals = False
    if type(df) == pandas.core.series.Series:
        df1_istotals = True
        df = pandas.DataFrame(df)
        # if just a single result
    else:
        df = pandas.DataFrame(df)
    if operation.startswith('k'):
        if sort_by is False:
            if not df1_istotals:
                sort_by = 'turbulent'
        if df1_istotals:
            df = df.T
    
    # figure out if there's a second list
    # copy and remove totals if there is
    single_totals = True
    using_totals = False
    outputmode = False

    try:
        if dataframe2.empty is False:            
            df2 = dataframe2.copy()
            using_totals = True

            if type(df2) == pandas.core.frame.DataFrame:
                if len(df2.columns) > 1:
                    single_totals = False
                else:
                    df2 = pd.Series(df2)
                if operation == 'd':
                    df2 = df2.sum(axis = 1)
                    single_totals = True
            elif type(df2) == pandas.core.series.Series:
                single_totals = True
                #if operation == 'k':
                    #raise ValueError('Keywording requires a DataFrame for dataframe2. Use "self"?')
            else:
                raise ValueError('dataframe2 not recognised.')
    except AttributeError:
        if operation in ['k', 'd', 'a', '%', '/', '*']:
            dataframe2 = 'self'         
        if dataframe2 == 'self':
            outputmode = True
        else:
            if operation in ['-', '+']:
                raise ValueError('dataframe2 needed for +/- operations.')

    if operation.startswith('a') or operation.startswith('A'):
        if list(df.columns)[0] != '0' and list(df.columns)[0] != 0:
            df = df.T
        if using_totals:
            if not single_totals:
                df2 = df2.T

    if projection:
        # projection shouldn't do anything when working with '%', remember.
        df = projector(df, projection)
        if using_totals:
            df2 = projector(df2, projection)

    if spelling:
        df = convert_spell(df, convert_to = spelling)
        df = merge_duplicates(df, print_info = False)

        if not single_totals:
            df2 = convert_spell(df2, convert_to = spelling, print_info = False)
            df2 = merge_duplicates(df2, print_info = False)
        if not df1_istotals:
            sort_by = 'total'

    if replace_names:
        df = name_replacer(df, replace_names)
        df = merge_duplicates(df)
        if not single_totals:
            df2 = name_replacer(df2, print_info = False)
            df2 = merge_duplicates(df2, print_info = False)
        if not sort_by:
            sort_by = 'total'

    # remove old stats if they're there:
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        df = df.drop(statfields, axis = 0)
    except:
        pass
    if using_totals:
        try:
            df2 = df2.drop(statfields, axis = 0)
        except:
            pass

    # merging: make dicts if they aren't already, so we can iterate
    if merge_entries:
        if type(merge_entries) != dict:
            merge_entries = {newname: merge_entries}
        for name, input in sorted(merge_entries.items()):
            the_newname = newname_getter(df, parse_input(df, input), newname = name, prinf = print_info)
            df = merge_these_entries(df, parse_input(df, input), the_newname, prinf = print_info)
            if not single_totals:
                df2 = merge_these_entries(df2, parse_input(df2, input), the_newname, prinf = False)
    
    if merge_subcorpora:
        print merge_subcorpora
        print new_subcorpus_name
        if type(merge_subcorpora) != dict:
            if type(merge_subcorpora) == list:
                if type(merge_subcorpora[0]) == tuple:
                    merge_subcorpora = {x: y for x, y in merge_subcorpora}
                elif type(merge_subcorpora[0]) == str or type(merge_subcorpora[0]) == unicode:
                    merge_subcorpora = {new_subcorpus_name: [x for x in merge_subcorpora]}
                elif type(merge_subcorpora[0]) == int:
                    merge_subcorpora = {new_subcorpus_name: [str(x) for x in merge_subcorpora]}
            else:
                merge_subcorpora = {new_subcorpus_name: merge_subcorpora}
        for name, input in sorted(merge_subcorpora.items()):
            the_newname = newname_getter(df.T, parse_input(df.T, input), 
                                     newname = name, 
                                     merging_subcorpora = True,
                                     prinf = print_info)
            df = merge_these_entries(df.T, parse_input(df.T, input), the_newname, merging = 'subcorpora', prinf = print_info).T
            if using_totals:
                df2 = merge_these_entries(df2.T, parse_input(df2.T, input), the_newname, merging = 'subcorpora', prinf = False).T
    

    if just_subcorpora:
        df = just_these_subcorpora(df, just_subcorpora, prinf = print_info)
        if using_totals:
            df2 = just_these_subcorpora(df2, just_subcorpora, prinf = False)
    
    if skip_subcorpora:
        df = skip_these_subcorpora(df, skip_subcorpora, prinf = print_info)
        if using_totals:
            df2 = skip_these_subcorpora(df2, skip_subcorpora, prinf = False)
    
    if span_subcorpora:
        df = span_these_subcorpora(df, span_subcorpora, prinf = print_info)
        if using_totals:
            df2 = span_these_subcorpora(df2, span_subcorpora, prinf = False)

    if just_entries:
        df = just_these_entries(df, parse_input(df, just_entries), prinf = print_info)
        if not single_totals:
            df2 = just_these_entries(df2, parse_input(df2, just_entries), prinf = False)
    if skip_entries:
        df = skip_these_entries(df, parse_input(df, skip_entries), prinf = print_info)
        if not single_totals:
            df2 = skip_these_entries(df2, parse_input(df2, skip_entries), prinf = False)

    # drop infinites and nans
    if operation != 'd':
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.fillna(0.0)

    # make just_totals as dataframe
    just_one_total_number = False
    if just_totals:
        df = pd.DataFrame(df.sum(), columns = ['Combined total'])
        if using_totals:
            if not single_totals:
                df2 = pd.DataFrame(df2.sum(), columns = ['Combined total'])
            else:
                just_one_total_number = True
                df2 = df2.sum()

    # combine lists

    tots = df.sum(axis = 1)

    if using_totals or outputmode:
        if not operation.startswith('k'):
            the_threshold = 0
            # set a threshold if just_totals
            if outputmode is True:
                df2 = df.T.sum()
                if not just_totals:
                    df2.name = 'Total'
                else:
                    df2.name = 'Combined total'
                using_totals = True
                single_totals = True
            if just_totals:
                if not single_totals:
                    the_threshold = set_threshold(df2, threshold, prinf = print_info)
            if operation == 'd':
                the_threshold = set_threshold(df2, threshold, prinf = print_info) 
            df, tots = combiney(df, df2, operation = operation, threshold = the_threshold, prinf = print_info)
    
    # if doing keywording...
    if operation.startswith('k'):
        from corpkit.keys import keywords

        # allow saved dicts to be df2, etc
        try:
            if dataframe2 == 'self':
                df2 = df.copy()
        except TypeError:
            pass
        if type(dataframe2) == str:
            if dataframe2 != 'self':
                df2 = dataframe2
    
        else:
            the_threshold = False

        df = keywords(df, df2, 
                      selfdrop = selfdrop, 
                      threshold = threshold, 
                      printstatus = print_info,
                      editing = True,
                      calc_all = calc_all,
                      **kwargs)

        # eh?
        df = df.T
    
    # drop infinites and nans
    if operation != 'd':
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.fillna(0.0)

    # resort data
    if sort_by:
        df = resort(df, keep_stats = keep_stats, sort_by = sort_by)

    if keep_top:
        if not just_totals:
            df = df[list(df.columns)[:keep_top]]
        else:
            df = df.head(keep_top)

    if just_totals:
        # turn just_totals into series:
        df = pd.Series(df['Combined total'], name = 'Combined total')

    if df1_istotals:
        if operation.startswith('k'):
            try:
                df = pd.Series(df.ix[dataframe1.name])
                df.name = '%s: keyness' % df.name
            except:
                df = df.iloc[0,:]
                df.name = 'keyness' % df.name

    # generate totals branch if not percentage results:
    if df1_istotals or operation.startswith('k'):
        try:
            total = pd.Series(df['Total'], name = 'Total')
        except:
            total = 'none'
        #total = df.copy()
    else:
        # might be wrong if using division or something...
        total = df.T.sum()

    if type(tots) != pandas.core.frame.DataFrame and type(tots) != pandas.core.series.Series:
        total = df.sum(axis = 1)
    else:
        total = tots

    if type(df) == pandas.core.frame.DataFrame:
        datatype = df.ix[0].dtype
    else:
        datatype = df.dtype

    # TURN INT COL NAMES INTO STR
    try:
        df.results.columns = [str(d) for d in list(df.results.columns)]
    except:
        pass

    #make named_tuple
    the_operation = 'none'
    if using_totals:
        the_operation = operation

    the_options = {}
    the_options['time_started'] = the_time_started
    the_options['function'] = 'editor'
    the_options['dataframe1'] = dataframe1
    the_options['operation'] = the_operation
    the_options['dataframe2'] = dataframe2
    the_options['datatype'] = datatype
    the_options['sort_by'] = sort_by
    the_options['keep_stats'] = keep_stats
    the_options['just_totals'] = just_totals
    the_options['threshold'] = threshold # can be wrong!
    the_options['just_entries'] = just_entries
    the_options['just_entries'] = just_entries
    the_options['skip_entries'] = skip_entries
    the_options['merge_entries'] = merge_entries
    the_options['newname'] = newname
    the_options['just_subcorpora'] = just_subcorpora
    the_options['skip_subcorpora'] = skip_subcorpora
    the_options['span_subcorpora'] = span_subcorpora
    the_options['merge_subcorpora'] = merge_subcorpora
    the_options['new_subcorpus_name'] = new_subcorpus_name
    the_options['projection'] = projection
    the_options['remove_above_p'] = remove_above_p
    the_options['p'] = p
    the_options['revert_year'] = revert_year
    the_options['print_info'] = print_info

    outputnames = collections.namedtuple('edited_interrogation', ['query', 'results', 'totals'])
    output = outputnames(the_options, df, total)

    #print '\nResult (sample)\n'
    if print_info:
        #if merge_entries or merge_subcorpora or span_subcorpora or just_subcorpora or \
           #just_entries or skip_entries or skip_subcorpora or printed_th or projection:
        print '***Done!***\n========================\n'
    #print df.head().T
    #print ''
    if operation.startswith('k') or just_totals or df1_istotals:
        pd.set_option('display.max_rows', 30)
    else:
        pd.set_option('display.max_rows', 15)
    pd.set_option('display.max_columns', 8)
    pd.set_option('max_colwidth',70)
    pd.set_option('display.width', 800)
    pd.set_option('expand_frame_repr', False)
    pd.set_option('display.float_format', lambda x: '%.2f' % x)

    return output