def load_all_results(data_dir = 'saved_interrogations', only_concs = False, **kwargs):
    """Load every saved interrogation in data_dir into a dict.

    :param data_dir: directory to search; entries ending in ``.p`` and
                     subdirectories are handed to ``load_result``
    :type data_dir: str
    :param only_concs: forwarded unchanged to ``load_result``
    :type only_concs: bool
    :param root: optional keyword; an object whose ``update()`` is called
                 after every entry. When it is truthy, an empty data_dir
                 makes the function return None instead of raising
    :param note: optional keyword; an object whose ``progvar.set()`` is
                 given a 0-100 progress value when more than three
                 entries are found
    :returns: dict mapping each entry name (extension stripped) to its
              loaded result, or None if nothing was found and root is set
    :raises ValueError: if nothing was found and no root was given
    """
    import os
    from time import localtime, strftime

    def timestamp():
        """Return the current local time as HH:MM:SS for log lines."""
        return strftime("%H:%M:%S", localtime())

    root = kwargs.get('root', False)
    note = kwargs.get('note', False)

    fs = [f for f in os.listdir(data_dir)
          if f.endswith('.p') or os.path.isdir(os.path.join(data_dir, f))]

    # the progress bar is only driven for more than three entries
    show_progress = bool(note) and len(fs) > 3
    if show_progress:
        note.progvar.set(0)

    if not fs:
        if not root:
            raise ValueError('No saved data found in %s' % data_dir)
        return

    # imported only once there is something to load
    import pandas
    from other import load_result

    r = {}
    loaded = 0
    for index, finding in enumerate(fs):
        name = os.path.splitext(finding)[0]
        try:
            tmp = load_result(finding, loaddir = data_dir, only_concs = only_concs)
            # a DataFrame has no single truth value, so only other
            # types are tested for emptiness
            keep = isinstance(tmp, pandas.DataFrame) or bool(tmp)
        except Exception:
            # best effort: report the failure and carry on with the rest
            print('%s: %s failed to load. Try using load_result to find out the matter.' % (timestamp(), finding))
            keep = False
        if keep:
            r[name] = tmp
            print('%s: %s loaded as %s.' % (timestamp(), finding, name))
            loaded += 1
        # keep progress bar and GUI responsive, also for skipped entries
        if show_progress:
            note.progvar.set((index + 1) * 100.0 / len(fs))
        if root:
            root.update()

    print('%s: %d interrogations loaded from %s.' % (timestamp(), loaded, os.path.basename(data_dir)))
    return r
def load_all_results(data_dir = 'data/saved_interrogations'):
    """Load every saved interrogation in data_dir into a dict.

    :param data_dir: directory searched for files ending in ``.p``
    :type data_dir: str
    :returns: dict mapping each file name (extension stripped) to the
              object returned by ``load_result``; files that fail to
              load are reported on stdout and left out
    :raises ValueError: if data_dir contains no ``.p`` files
    """
    import os
    from time import localtime, strftime

    def timestamp():
        """Return the current local time as HH:MM:SS for log lines."""
        return strftime("%H:%M:%S", localtime())

    fs = [f for f in os.listdir(data_dir) if f.endswith('.p')]
    if not fs:
        raise ValueError('No results found in %s' % data_dir)

    # imported only once there is something to load
    from corpkit.other import load_result

    r = {}
    for finding in fs:
        name = os.path.splitext(finding)[0]
        try:
            r[name] = load_result(finding, loaddir = data_dir)
        except Exception:
            # best effort: report the failure and carry on with the rest
            print('%s: %s failed to load. Try using load_result to find out the matter.' % (timestamp(), finding))
        else:
            print('%s: %s loaded as %s.' % (timestamp(), finding, name))
    return r
# <markdowncell> # Those already proficient with Python can use [Pandas' `plot()` function](http://pandas.pydata.org/pandas-docs/stable/visualization.html) if they like. # <markdowncell> # Another neat thing you can do is save the results of an interrogation, so that it doesn't have to be run again the next time you load this notebook: # <codecell> # specify what to save, and a name for the file. from corpkit import save_result, load_result save_result(allwords, 'allwords') # <markdowncell> # You can then load these results: # <codecell> fromfile_allwords = load_result('allwords') fromfile_allwords.totals # <markdowncell> # ... or erase them from memory: # <codecell> fromfile_allwords = None # fromfile_allwords # <markdowncell> # ### `quickview()` # <markdowncell> # `quickview()` is a function that quickly shows the n most frequent items in a list. Its arguments are:
def quickview(results, n = 25):
    """View top n results of results.

    :param results: Interrogation/edited result to view. A string is
                    treated as a file name, looked up first in
                    ``dictionaries`` and then in ``saved_interrogations``
    :type results: corpkit.interrogation/pandas.core.frame.DataFrame
    :param n: Show top *n* results; ``'all'`` shows up to 9999
    :type n: int
    :returns: None; the entries are printed to stdout
    :raises ValueError: if a file name is given and found in neither folder
    :raises TypeError: if results is of a kind that cannot be displayed
    """
    import os
    import numpy
    import pandas

    # handle dictionaries too:
    dictpath = 'dictionaries'
    savedpath = 'saved_interrogations'

    # 'all' is approximated with a limit larger than any realistic result
    if n == 'all':
        n = 9999

    if isinstance(results, str):
        dict_file = os.path.join(dictpath, results)
        if os.path.isfile(dict_file):
            import pickle
            # NOTE: unpickling can run arbitrary code; only open trusted files
            with open(dict_file, 'rb') as handle:
                unpickled = pickle.load(handle)
            print('\nTop %d entries in %s:\n' % (n, dict_file))
            for index, (w, f) in enumerate(unpickled.most_common(n)):
                fildex = '% 3d' % index
                print('%s: %s (n=%d)' % (fildex, w, f))
            return
        elif os.path.isfile(os.path.join(savedpath, results)):
            from corpkit import load_result
            print('\n%s loaded temporarily from file:\n' % results)
            results = load_result(results)
        else:
            raise ValueError('File %s not found in saved_interrogations or dictionaries' % results)

    # option: how each line is formatted
    # resbranch: True when entries are columns, False when they are index labels
    option = None
    results_branch = None
    resbranch = False
    is_interrogation = 'interrogation' in str(type(results))

    if is_interrogation:
        clas = results.query['function']
        if clas == 'interrogator':
            datatype = results.results.iloc[0, 0].dtype
            option = 'total' if datatype == 'int64' else '%'
            if results.query['query'] == 'keywords':
                option = 'keywords'
            elif results.query['query'] == 'ngrams':
                option = 'ngrams'
            results_branch = results.results
            resbranch = True
        elif clas == 'editor':
            # currently, it's wrong if you edit keywords! oh well
            results_branch = results.results
            if results.query['just_totals']:
                # just_totals results expose a single dtype, so they are
                # inspected directly rather than through a two-axis lookup
                resbranch = False
                option = 'total' if results_branch.dtype == 'int64' else '%'
            else:
                resbranch = True
                datatype = results_branch.iloc[0, 0].dtype
                option = 'total' if datatype == 'int64' else '%'
    elif isinstance(results, pandas.DataFrame):
        results_branch = results
        resbranch = True
        if results.size and isinstance(results.iloc[0, 0], numpy.integer):
            option = 'total'
        else:
            option = '%'
    elif isinstance(results, pandas.Series):
        results_branch = results
        resbranch = False
        if results.size and isinstance(results.iloc[0], numpy.integer):
            option = 'total'
        else:
            option = '%'
        if results.name == 'keywords':
            option = 'series_keywords'

    if results_branch is None or option is None:
        raise TypeError('quickview() cannot display %s' % type(results))

    if resbranch:
        the_list = list(results_branch)[:n]
    else:
        the_list = list(results_branch.index)[:n]

    for index, w in enumerate(the_list):
        fildex = '% 3d' % index
        if option == 'keywords':
            print('%s: %s' % (fildex, w))
        elif option == '%' or option == 'ratio':
            if is_interrogation:
                totstr = "%.3f" % results.totals[w]
                print('%s: %s (%s%%)' % (fildex, w, totstr))
            else:
                print('%s: %s' % (fildex, w))
        elif option == 'series_keywords':
            tot = results_branch[w]
            print('%s: %s (k=%d)' % (fildex, w, tot))
        else:
            if resbranch:
                tot = sum(results_branch[w])
            else:
                tot = results_branch[w]
            print('%s: %s (n=%d)' % (fildex, w, tot))
def quickview(results, n = 25):
    """View top n results of results.

    Ideally, pass it interrogator() or plotter output. It will also accept
    DataFrames or Series (i.e. .results or .totals branches). A string is
    treated as a file name, looked up first in ``data/dictionaries`` and
    then in ``data/saved_interrogations``.

    Nothing is returned; the entries are printed to stdout. A ValueError
    is raised when a file name is found in neither folder, and a TypeError
    when results is of a kind that cannot be displayed.
    """
    import os
    import numpy
    import pandas

    # handle dictionaries too:
    dictpath = 'data/dictionaries'
    savedpath = 'data/saved_interrogations'

    if isinstance(results, str):
        dict_file = os.path.join(dictpath, results)
        if os.path.isfile(dict_file):
            import pickle
            # NOTE: unpickling can run arbitrary code; only open trusted files
            with open(dict_file, 'rb') as handle:
                unpickled = pickle.load(handle)
            print('\nTop %d entries in %s:\n' % (n, dict_file))
            for index, (w, f) in enumerate(unpickled.most_common(n)):
                fildex = '% 3d' % index
                print('%s: %s (n=%d)' % (fildex, w, f))
            return
        elif os.path.isfile(os.path.join(savedpath, results)):
            from corpkit import load_result
            print('\n%s loaded temporarily from file:\n' % results)
            results = load_result(results)
        else:
            raise ValueError('File %s not found in data/saved_interrogations or data/dictionaries' % results)

    # option: how each line is formatted
    # resbranch: True when entries are columns, False when they are index labels
    option = None
    results_branch = None
    resbranch = False

    if 'interrogation' in str(type(results)):
        clas = results.query['function']
        if clas == 'interrogator':
            datatype = results.query['datatype']
            # NOTE(review): 'float64' mapping to 'total' looks inverted
            # relative to the editor branch below -- confirm what
            # query['datatype'] holds before changing it
            option = 'total' if datatype == 'float64' else '%'
            if results.query['query'] == 'keywords':
                option = 'keywords'
            elif results.query['query'] == 'ngrams':
                option = 'ngrams'
            try:
                results_branch = results.results
                resbranch = True
            except AttributeError:
                resbranch = False
                results_branch = results
        elif clas == 'editor':
            # currently, it's wrong if you edit keywords! oh well
            datatype = results.query['datatype']
            if results.query['just_totals']:
                resbranch = False
                results_branch = results.results
                option = 'total' if results_branch.dtype == 'int64' else '%'
            else:
                option = 'total' if datatype == 'int64' else '%'
                try:
                    results_branch = results.results
                    resbranch = True
                except AttributeError:
                    # same fallback as the interrogator branch
                    resbranch = False
                    results_branch = results
    elif isinstance(results, pandas.DataFrame):
        results_branch = results
        resbranch = True
        if results.size and isinstance(results.iloc[0, 0], numpy.integer):
            option = 'total'
        else:
            option = '%'
    elif isinstance(results, pandas.Series):
        results_branch = results
        resbranch = False
        if results.size and isinstance(results.iloc[0], numpy.integer):
            option = 'total'
        else:
            option = '%'
        if results.name == 'keywords':
            option = 'series_keywords'

    if results_branch is None or option is None:
        raise TypeError('quickview() cannot display %s' % type(results))

    if resbranch:
        the_list = list(results_branch)[:n]
    else:
        the_list = list(results_branch.index)[:n]

    for index, w in enumerate(the_list):
        fildex = '% 3d' % index
        if option == 'keywords':
            print('%s: %s' % (fildex, w))
        elif option == '%' or option == 'ratio':
            print('%s: %s' % (fildex, w))
        elif option == 'series_keywords':
            tot = results_branch[w]
            print('%s: %s (kq=%d)' % (fildex, w, tot))
        else:
            if resbranch:
                tot = sum(results_branch[w])
            else:
                tot = results_branch[w]
            print('%s: %s (n=%d)' % (fildex, w, tot))