Пример #1
0
def load_all_results(data_dir = 'saved_interrogations', only_concs = False, **kwargs):
    """Load every saved interrogation in data_dir into a dict.

    :param data_dir: directory to scan; entries ending in ``.p`` and
                     subdirectories are handed to ``load_result``
    :type data_dir: str
    :param only_concs: forwarded unchanged to ``load_result``
    :type only_concs: bool
    :param root: optional keyword; when truthy, ``root.update()`` is called
                 after every entry and an empty directory is not an error
    :param note: optional keyword; when truthy and more than three entries
                 are found, ``note.progvar`` is set to the percentage done
    :returns: dict mapping each entry name (extension stripped) to the
              object returned by ``load_result``
    :rtype: dict
    :raises ValueError: if nothing loadable is found and no ``root`` was given
    """
    import os
    from time import localtime, strftime

    def timestamp():
        """Return the current local time as HH:MM:SS for log lines."""
        return strftime("%H:%M:%S", localtime())

    root = kwargs.get('root', False)
    note = kwargs.get('note', False)

    fs = [f for f in os.listdir(data_dir)
          if f.endswith('.p') or os.path.isdir(os.path.join(data_dir, f))]

    # the progress bar is only driven for more than three entries
    show_progress = bool(note) and len(fs) > 3
    if show_progress:
        note.progvar.set(0)

    # Without a root an empty directory is an error. With a root the function
    # carries on, reports zero loaded items and returns an empty dict.
    if not fs and not root:
        raise ValueError('No saved data found in %s' % data_dir)

    # imported only once there is something to load
    import pandas
    from other import load_result

    r = {}
    loaded = 0
    for index, finding in enumerate(fs):
        name = os.path.splitext(finding)[0]
        try:
            tmp = load_result(finding, loaddir = data_dir, only_concs = only_concs)
            # DataFrames have no truth value, so they are recognised by type;
            # anything else is kept only when it is truthy. The truth test
            # stays inside the try so that objects refusing it are reported
            # as failed rather than aborting the whole run.
            keep = isinstance(tmp, pandas.DataFrame) or bool(tmp)
        except Exception:
            # print(...) with a single argument behaves the same as the
            # Python 2 print statement and also works on Python 3
            print('%s: %s failed to load. Try using load_result to find out the matter.' % (timestamp(), finding))
        else:
            if keep:
                r[name] = tmp
                print('%s: %s loaded as %s.' % (timestamp(), finding, name))
                loaded += 1
        # progress and GUI refresh happen for every entry, including the
        # ones that were skipped or failed
        if show_progress:
            note.progvar.set((index + 1) * 100.0 / len(fs))
        if root:
            root.update()

    print('%s: %d interrogations loaded from %s.' % (timestamp(), loaded, os.path.basename(data_dir)))
    return r
Пример #2
0
def load_all_results(data_dir = 'data/saved_interrogations'):
    """Load every saved interrogation in data_dir into a dict.

    :param data_dir: directory scanned for files ending in ``.p``
    :type data_dir: str
    :returns: dict mapping each file name (extension stripped) to the object
              returned by ``load_result``; files that fail to load are
              reported on stdout and left out
    :rtype: dict
    :raises ValueError: if data_dir contains no ``.p`` files
    """
    import os
    from time import localtime, strftime

    def timestamp():
        """Return the current local time as HH:MM:SS for log lines."""
        return strftime("%H:%M:%S", localtime())

    fs = [f for f in os.listdir(data_dir) if f.endswith('.p')]
    if not fs:
        raise ValueError('No results found in %s' % data_dir)

    # imported only once there is something to load
    from corpkit.other import load_result

    r = {}
    for finding in fs:
        name = os.path.splitext(finding)[0]
        try:
            r[name] = load_result(finding, loaddir = data_dir)
        except Exception:
            # print(...) with a single argument behaves the same as the
            # Python 2 print statement and also works on Python 3
            print('%s: %s failed to load. Try using load_result to find out the matter.' % (timestamp(), finding))
        else:
            print('%s: %s loaded as %s.' % (timestamp(), finding, name))
    return r
Пример #3
0
# <markdowncell>
# Those already proficient with Python can use [Pandas' `plot()` function](http://pandas.pydata.org/pandas-docs/stable/visualization.html) if they like.

# <markdowncell>
# Another neat thing you can do is save the results of an interrogation, so they don't have to be run the next time you load this notebook:

# <codecell>
# specify what to save, and a name for the file.
# NOTE(review): `allwords` is not defined in this excerpt; presumably it is
# an interrogation result produced by an earlier cell - confirm.
from corpkit import save_result, load_result
save_result(allwords, 'allwords')

# <markdowncell>
# You can then load these results:

# <codecell>
# load the result back under the name it was saved with
fromfile_allwords = load_result('allwords')
# bare expression: evaluates the `totals` attribute of the loaded result
fromfile_allwords.totals

# <markdowncell>
# ... or erase them from memory:

# <codecell>
# rebinding the name to None drops this reference to the loaded result
fromfile_allwords = None
# fromfile_allwords

# <markdowncell>
# ### `quickview()`

# <markdowncell>
# `quickview()` is a function that quickly shows the n most frequent items in a list. Its arguments are:
Пример #4
0
def quickview(results, n = 25):
    """View top n results of results.

    Each entry is printed on its own line, prefixed with its zero-based rank.

    :param results: Interrogation/edited result to view, a DataFrame or
                    Series, or the name of a file inside ``dictionaries`` or
                    ``saved_interrogations``
    :type results: corpkit.interrogation/pandas.core.frame.DataFrame/pandas.core.series.Series/str
    :param n: Show top *n* results; the string ``'all'`` is treated as 9999
    :type n: int
    :returns: None; output is printed
    :raises ValueError: if *results* is a string that names no known file
    :raises TypeError: if *results* is of a type that cannot be displayed
    """
    import os

    import numpy
    import pandas

    # handle dictionaries too:
    dictpath = 'dictionaries'
    savedpath = 'saved_interrogations'

    # too lazy to code this properly for every possible data type:
    if n == 'all':
        n = 9999

    if isinstance(results, str):
        dict_file = os.path.join(dictpath, results)
        if os.path.isfile(dict_file):
            import pickle
            # NOTE: unpickling can execute arbitrary code; only point this at
            # dictionary files you created yourself.
            with open(dict_file, 'rb') as handle:
                unpickled = pickle.load(handle)
            # print(...) with a single argument behaves the same as the
            # Python 2 print statement and also works on Python 3
            print('\nTop %d entries in %s:\n' % (n, dict_file))
            for index, (w, f) in enumerate(unpickled.most_common(n)):
                fildex = '% 3d' % index
                print('%s: %s (n=%d)' % (fildex, w, f))
            return
        elif os.path.isfile(os.path.join(savedpath, results)):
            from corpkit import load_result
            print('\n%s loaded temporarily from file:\n' % results)
            results = load_result(results)
        else:
            raise ValueError('File %s not found in saved_interrogations or dictionaries' % results)

    # results_branch: the object whose columns (or index) are listed
    # resbranch: True when columns are listed, False when the index is
    # option: selects the per-line output format
    results_branch = None
    resbranch = None
    option = None

    is_interrogation = 'interrogation' in str(type(results))

    if is_interrogation:
        clas = results.query['function']
        if clas == 'interrogator':
            results_branch = results.results
            resbranch = True
            if results_branch.iloc[0, 0].dtype == 'int64':
                option = 'total'
            else:
                option = '%'
            # keyword and ngram queries get their own output format
            if results.query['query'] in ('keywords', 'ngrams'):
                option = results.query['query']

        elif clas == 'editor':
            # currently, it's wrong if you edit keywords! oh well
            results_branch = results.results
            if results.query['just_totals']:
                # one-dimensional result: inspect its dtype directly, since
                # two-dimensional indexing does not apply here
                resbranch = False
                datatype = results_branch.dtype
            else:
                resbranch = True
                datatype = results_branch.iloc[0, 0].dtype
            if datatype == 'int64':
                option = 'total'
            else:
                option = '%'

    if isinstance(results, pandas.DataFrame):
        results_branch = results
        resbranch = True
        if isinstance(results.iloc[0, 0], numpy.int64):
            option = 'total'
        else:
            option = '%'
    elif isinstance(results, pandas.Series):
        resbranch = False
        results_branch = results
        if isinstance(results.iloc[0], numpy.int64):
            option = 'total'
        else:
            option = '%'
        if results.name == 'keywords':
            option = 'series_keywords'

    if results_branch is None:
        raise TypeError('quickview cannot display objects of type %s' % type(results).__name__)

    if resbranch:
        the_list = list(results_branch)[:n]
    else:
        the_list = list(results_branch.index)[:n]

    for index, w in enumerate(the_list):
        fildex = '% 3d' % index
        if option == 'keywords':
            print('%s: %s' % (fildex, w))
        elif option == '%' or option == 'ratio':
            if is_interrogation:
                totstr = "%.3f" % results.totals[w]
                print('%s: %s (%s%%)' % (fildex, w, totstr))
            else:
                print('%s: %s' % (fildex, w))
        elif option == 'series_keywords':
            print('%s: %s (k=%d)' % (fildex, w, results_branch[w]))
        else:
            if resbranch:
                # add up the whole column to get the entry's total
                tot = sum(results_branch[w])
            else:
                tot = results_branch[w]
            print('%s: %s (n=%d)' % (fildex, w, tot))
Пример #5
0
def quickview(results, n = 25):
    """View top n results of results.

    Ideally, pass it interrogator() or plotter output. It will also accept
    DataFrames or Series (i.e. .results or .totals branches), or the name of
    a file inside ``data/dictionaries`` or ``data/saved_interrogations``.

    Each entry is printed on its own line, prefixed with its zero-based rank.

    Raises ValueError if a string is given that names no known file, and
    TypeError if results is of a type that cannot be displayed.
    """
    import os

    import numpy
    import pandas

    # handle dictionaries too:
    dictpath = 'data/dictionaries'
    savedpath = 'data/saved_interrogations'

    if isinstance(results, str):
        dict_file = os.path.join(dictpath, results)
        if os.path.isfile(dict_file):
            import pickle
            # NOTE: unpickling can execute arbitrary code; only point this at
            # dictionary files you created yourself.
            with open(dict_file, 'rb') as handle:
                unpickled = pickle.load(handle)
            # print(...) with a single argument behaves the same as the
            # Python 2 print statement and also works on Python 3
            print('\nTop %d entries in %s:\n' % (n, dict_file))
            for index, (w, f) in enumerate(unpickled.most_common(n)):
                fildex = '% 3d' % index
                print('%s: %s (n=%d)' % (fildex, w, f))
            return
        elif os.path.isfile(os.path.join(savedpath, results)):
            from corpkit import load_result
            print('\n%s loaded temporarily from file:\n' % results)
            results = load_result(results)
        else:
            raise ValueError('File %s not found in data/saved_interrogations or data/dictionaries' % results)

    # results_branch: the object whose columns (or index) are listed
    # resbranch: True when columns are listed, False when the index is
    # option: selects the per-line output format
    results_branch = None
    resbranch = None
    option = None

    if 'interrogation' in str(type(results)):
        clas = results.query['function']

        if clas == 'interrogator':
            # NOTE(review): float64 maps to 'total' here, while the editor
            # branch below maps int64 to 'total' - confirm which is intended.
            if results.query['datatype'] == 'float64':
                option = 'total'
            else:
                option = '%'
            # keyword and ngram queries get their own output format
            if results.query['query'] in ('keywords', 'ngrams'):
                option = results.query['query']

            try:
                results_branch = results.results
                resbranch = True
            except AttributeError:
                resbranch = False
                results_branch = results

        elif clas == 'editor':
            # currently, it's wrong if you edit keywords! oh well
            if results.query['just_totals']:
                resbranch = False
                results_branch = results.results
                if results_branch.dtype == 'int64':
                    option = 'total'
                else:
                    option = '%'
            else:
                if results.query['datatype'] == 'int64':
                    option = 'total'
                else:
                    option = '%'
                try:
                    results_branch = results.results
                    resbranch = True
                except AttributeError:
                    # same fallback as the interrogator branch, so that
                    # results_branch is never left unset
                    resbranch = False
                    results_branch = results

    if isinstance(results, pandas.DataFrame):
        results_branch = results
        resbranch = True
        # index the cell directly: going through a whole row first would
        # upcast mixed-dtype rows and depends on positional Series lookup
        if isinstance(results.iloc[0, 0], numpy.int64):
            option = 'total'
        else:
            option = '%'

    elif isinstance(results, pandas.Series):
        resbranch = False
        results_branch = results
        if isinstance(results.iloc[0], numpy.int64):
            option = 'total'
        else:
            option = '%'
        if results.name == 'keywords':
            option = 'series_keywords'

    if results_branch is None:
        raise TypeError('quickview cannot display objects of type %s' % type(results).__name__)

    if resbranch:
        the_list = list(results_branch)[:n]
    else:
        the_list = list(results_branch.index)[:n]

    for index, w in enumerate(the_list):
        fildex = '% 3d' % index
        if option == 'keywords':
            print('%s: %s' % (fildex, w))
        elif option == '%' or option == 'ratio':
            print('%s: %s' % (fildex, w))
        elif option == 'series_keywords':
            print('%s: %s (kq=%d)' % (fildex, w, results_branch[w]))
        else:
            if resbranch:
                # add up the whole column to get the entry's total
                tot = sum(results_branch[w])
            else:
                tot = results_branch[w]
            print('%s: %s (n=%d)' % (fildex, w, tot))