def check_interpreter_saved_interro():
    """
    Interpreter made a pickled result. Check it

    Loads the saved interrogation ``test-speak-parsed-anylemma``, deletes the
    ``saved_interrogations`` directory, and then checks that the loaded
    object has ``results``, ``totals`` and ``query`` attributes, that
    ``'concordancing'`` is among the results, and that the first row of the
    relative frequencies sums to 0.19 (rounded to two places).
    """
    import shutil
    from corpkit import load

    dat = load('test-speak-parsed-anylemma')
    # The data is in memory now, so the on-disk copy is removed before any
    # assertion can fail and leave it behind.
    shutil.rmtree('saved_interrogations')

    for attribute in ('results', 'totals', 'query'):
        assert hasattr(dat, attribute)
    assert 'concordancing' in dat.results

    rel = dat.results.T / dat.totals
    # The transposed index holds string labels, so the old ``.ix[0]`` was a
    # positional lookup; ``.iloc[0]`` is the equivalent that still exists in
    # current pandas (``.ix`` has been removed).
    assert_equals(rel.iloc[0].sum().round(2), 0.19)
def load_all_results(data_dir='saved_interrogations', **kwargs):
    """
    Load every saved interrogation in data_dir into a dict:

        >>> r = load_all_results()

    Every regular file in *data_dir* whose name ends in ``.p`` is passed to
    ``load``. A timestamped line is printed for each success and failure,
    followed by a summary line. Files that fail to load are skipped.

    :param data_dir: path to saved data
    :type data_dir: str
    :param root: optional keyword; if truthy, ``root.update()`` is called
                 after each file
    :param note: optional keyword; if truthy and there are more than three
                 files, ``note.progvar.set()`` receives the percentage done

    :returns: ``Interrodict`` built from a dict with filenames (minus the
              ``.p`` extension) as keys
    """
    import os
    from time import localtime, strftime
    from other import load
    from process import makesafe

    suffix = '.p'
    root = kwargs.get('root', False)
    note = kwargs.get('note', False)

    datafiles = [f for f in os.listdir(data_dir)
                 if os.path.isfile(os.path.join(data_dir, f))
                 and f.endswith(suffix)]
    output = {}
    loaded = 0

    for index, f in enumerate(datafiles):
        # Strip only the trailing extension; str.replace would also delete
        # any '.p' occurring earlier in the filename.
        loadname = f[:-len(suffix)]
        try:
            output[loadname] = load(f, loaddir=data_dir)
            timestamp = strftime("%H:%M:%S", localtime())
            print('%s: %s loaded as %s.' % (timestamp, f, makesafe(loadname)))
            loaded += 1
        except Exception:
            # Best effort: report and carry on. KeyboardInterrupt and
            # SystemExit are deliberately allowed to propagate.
            timestamp = strftime("%H:%M:%S", localtime())
            print('%s: %s failed to load. Try using load to find out the matter.'
                  % (timestamp, f))
        if note and len(datafiles) > 3:
            note.progvar.set((index + 1) * 100.0 / len(datafiles))
        if root:
            root.update()

    timestamp = strftime("%H:%M:%S", localtime())
    print('%s: %d interrogations loaded from %s.'
          % (timestamp, loaded, os.path.basename(data_dir)))
    from interrogation import Interrodict
    return Interrodict(output)
def load_all_results(data_dir='saved_interrogations', **kwargs):
    """
    Load every saved interrogation in data_dir into a dict:

        >>> r = load_all_results()

    Only regular files ending in ``.p`` are considered. Each one is handed
    to ``load``; progress is printed with a timestamp, and a file that
    raises while loading is reported and skipped.

    :param data_dir: path to saved data
    :type data_dir: str
    :param root: optional keyword; when truthy its ``update()`` method is
                 called once per file
    :param note: optional keyword; when truthy and more than three files
                 exist, ``note.progvar.set()`` is given the percent complete

    :returns: ``Interrodict`` wrapping a dict keyed by filename without the
              ``.p`` extension
    """
    import os
    from time import localtime, strftime
    from other import load
    from process import makesafe

    def _now():
        # Clock time used as the prefix of every progress message.
        return strftime("%H:%M:%S", localtime())

    extension = '.p'
    root = kwargs.get('root', False)
    note = kwargs.get('note', False)

    datafiles = []
    for candidate in os.listdir(data_dir):
        if not candidate.endswith(extension):
            continue
        if os.path.isfile(os.path.join(data_dir, candidate)):
            datafiles.append(candidate)

    total = len(datafiles)
    output = {}
    n_loaded = 0

    for index, f in enumerate(datafiles):
        # Drop just the final extension. The previous f.replace('.p', '')
        # removed every '.p' in the name, corrupting names like 'a.parsed.p'.
        loadname = f[:-len(extension)]
        try:
            output[loadname] = load(f, loaddir=data_dir)
        except Exception:
            # Narrower than a bare except so Ctrl-C still interrupts.
            print('%s: %s failed to load. Try using load to find out the matter.'
                  % (_now(), f))
        else:
            print('%s: %s loaded as %s.' % (_now(), f, makesafe(loadname)))
            n_loaded += 1
        if note and total > 3:
            note.progvar.set((index + 1) * 100.0 / total)
        if root:
            root.update()

    print('%s: %d interrogations loaded from %s.'
          % (_now(), n_loaded, os.path.basename(data_dir)))
    from interrogation import Interrodict
    return Interrodict(output)
def quickview(results, n=25):
    """View top n results as painlessly as possible.

    *results* may also be a filename: a file found in ``dictionaries`` is
    unpickled and its ``most_common(n)`` entries are printed; a file found in
    ``saved_interrogations`` is loaded with ``corpkit.load`` and then shown
    like any other interrogation.

    :param results: Interrogation data
    :type results: :class:``corpkit.interrogation.Interrogation``
    :param n: Show top *n* results (``'all'`` is treated as 9999)
    :type n: int
    :returns: None
    :raises ValueError: if a filename is in neither directory, or if
                        *results* is not a recognised corpkit class
    """
    import os
    import corpkit
    # NOTE(review): these names are not used below; the import is kept in
    # case loading the module is relied upon -- confirm and remove.
    from interrogation import Interrogation, Results, Totals

    # handle dictionaries too:
    dictpath = 'dictionaries'
    savedpath = 'saved_interrogations'

    # too lazy to code this properly for every possible data type:
    if n == 'all':
        n = 9999

    dtype = corpkit.interrogation.Interrogation

    if isinstance(results, str):
        dict_file = os.path.join(dictpath, results)
        if os.path.isfile(dict_file):
            import pickle
            # NOTE(review): unpickling can execute arbitrary code; this is
            # only safe for dictionary files from a trusted source.
            with open(dict_file, 'rb') as handle:
                unpickled = pickle.load(handle)
            print('\nTop %d entries in %s:\n' % (n, dict_file))
            for index, (w, f) in enumerate(unpickled.most_common(n)):
                fildex = '% 3d' % index
                print('%s: %s (n=%d)' % (fildex, w, f))
            return
        elif os.path.isfile(os.path.join(savedpath, results)):
            from corpkit import load
            print('\n%s loaded temporarily from file:\n' % results)
            results = load(results)
        else:
            # The filename was previously never interpolated into the message.
            raise ValueError(
                'File %s not found in saved_interrogations or dictionaries'
                % results)

    if results.__class__ == corpkit.interrogation.Results:
        option = 't' if results.iloc[0, 0].dtype == 'int64' else '%'
        the_list = list(results.columns)[:n]
        dtype = corpkit.interrogation.Results
    elif results.__class__ == corpkit.interrogation.Totals:
        option = 't' if results.iloc[0].dtype == 'int64' else '%'
        the_list = list(results.index)[:n]
        dtype = corpkit.interrogation.Totals
    elif results.__class__ == corpkit.interrogation.Interrogation:
        if 'results' not in results.__dict__:
            print(results.totals)
            return
        option = 't' if results.results.iloc[0, 0].dtype == 'int64' else '%'
        if 'operation' in results.query:
            # The first character of the operation overrides the guess made
            # from the dtype.
            operation = results.query['operation'].lower()
            for prefix in ('k', '%', '/'):
                if operation.startswith(prefix):
                    option = prefix
        try:
            the_list = list(results.results.columns)[:n]
        except AttributeError:
            # No ``columns`` attribute: fall back to the index.
            the_list = list(results.results.index)[:n]
    else:
        raise ValueError('Results not recognised.')

    # get longest word length for justification (0 when nothing to show,
    # so an empty result no longer crashes max())
    longest = max([len(i) for i in the_list]) if the_list else 0

    is_interrogation = dtype == corpkit.interrogation.Interrogation
    for index, entry in enumerate(the_list):
        number = str(index).rjust(3)
        word = entry.ljust(longest)
        if option == 't':
            to_get_from = results.results if is_interrogation else results
            tot = to_get_from[entry].sum()
            print('%s: %s (n=%d)' % (number, word, tot))
        elif option == '%' or option == '/':
            if is_interrogation:
                totstr = "%.3f" % results.totals[entry]
                print('%s: %s (%s%%)' % (number, word, totstr))
            elif dtype == corpkit.interrogation.Results:
                print('%s: %s (%s)' % (number, word, option))
            elif dtype == corpkit.interrogation.Totals:
                totstr = "%.3f" % results[entry]
                print('%s: %s (%s%%)' % (number, word, totstr))
        elif option == 'k':
            print('%s: %s (l/l)' % (number, word))
        else:
            print('%s: %s' % (number, word))
def quickview(results, n=25):
    """View top n results as painlessly as possible.

    If *results* is a string it is treated as a filename. A match in
    ``dictionaries`` is unpickled and its ``most_common(n)`` entries printed;
    a match in ``saved_interrogations`` is loaded via ``corpkit.load`` and
    displayed like a normal interrogation.

    :param results: Interrogation data
    :type results: :class:``corpkit.interrogation.Interrogation``
    :param n: Show top *n* results (``'all'`` is treated as 9999)
    :type n: int
    :returns: None
    :raises ValueError: when a filename is found in neither directory, or
                        *results* is not a recognised corpkit class
    """
    import os
    import corpkit
    # NOTE(review): the imported names are unused here; kept in case the
    # import itself is relied upon -- confirm before deleting.
    from interrogation import Interrogation, Results, Totals

    # handle dictionaries too:
    dictpath = 'dictionaries'
    savedpath = 'saved_interrogations'

    # too lazy to code this properly for every possible data type:
    if n == 'all':
        n = 9999

    interro_cls = corpkit.interrogation.Interrogation
    results_cls = corpkit.interrogation.Results
    totals_cls = corpkit.interrogation.Totals

    dtype = interro_cls

    if isinstance(results, str):
        dictionary_path = os.path.join(dictpath, results)
        saved_path = os.path.join(savedpath, results)
        if os.path.isfile(dictionary_path):
            try:
                import cPickle as pickle
            except ImportError:
                import pickle as pickle
            # NOTE(review): pickle can run arbitrary code on load; only open
            # dictionary files that come from a trusted source.
            with open(dictionary_path, 'rb') as fileobj:
                unpickled = pickle.load(fileobj)
            print('\nTop %d entries in %s:\n' % (n, dictionary_path))
            for index, (w, f) in enumerate(unpickled.most_common(n)):
                fildex = '% 3d' % index
                print('%s: %s (n=%d)' % (fildex, w, f))
            return
        if not os.path.isfile(saved_path):
            # Interpolate the filename; the message used to contain a raw
            # '%s' placeholder.
            raise ValueError(
                'File %s not found in saved_interrogations or dictionaries'
                % results)
        from corpkit import load
        print('\n%s loaded temporarily from file:\n' % results)
        results = load(results)

    if results.__class__ == results_cls:
        if results.iloc[0, 0].dtype == 'int64':
            option = 't'
        else:
            option = '%'
        the_list = list(results.columns)[:n]
        dtype = results_cls
    elif results.__class__ == totals_cls:
        if results.iloc[0].dtype == 'int64':
            option = 't'
        else:
            option = '%'
        the_list = list(results.index)[:n]
        dtype = totals_cls
    elif results.__class__ == interro_cls:
        if 'results' in results.__dict__:
            if results.results.iloc[0, 0].dtype == 'int64':
                option = 't'
            else:
                option = '%'
            if 'operation' in results.query:
                # An operation beginning with k, % or / takes precedence
                # over the dtype-based guess above.
                leading = results.query['operation'].lower()[:1]
                if leading in ('k', '%', '/'):
                    option = leading
            try:
                the_list = list(results.results.columns)[:n]
            except AttributeError:
                # Object without ``columns``: use its index instead.
                the_list = list(results.results.index)[:n]
        else:
            print(results.totals)
            return
    else:
        raise ValueError('Results not recognised.')

    # get longest word length for justification; guard the empty case,
    # where max() would raise
    longest = 0
    if the_list:
        longest = max(len(i) for i in the_list)

    for index, entry in enumerate(the_list):
        position = str(index).rjust(3)
        padded = entry.ljust(longest)
        if option == 't':
            if dtype == interro_cls:
                to_get_from = results.results
            else:
                to_get_from = results
            tot = to_get_from[entry].sum()
            print('%s: %s (n=%d)' % (position, padded, tot))
        elif option in ('%', '/'):
            if dtype == interro_cls:
                tot = results.totals[entry]
                print('%s: %s (%s%%)' % (position, padded, "%.3f" % tot))
            elif dtype == results_cls:
                print('%s: %s (%s)' % (position, padded, option))
            elif dtype == totals_cls:
                tot = results[entry]
                print('%s: %s (%s%%)' % (position, padded, "%.3f" % tot))
        elif option == 'k':
            print('%s: %s (l/l)' % (position, padded))
        else:
            print('%s: %s' % (position, padded))