def loader(savedir='saved_interrogations'):
    """Show a list of data that can be loaded, and then load by user input of index"""
    import glob
    import os
    import corpkit
    from corpkit.other import load
    fs = [i for i in glob.glob(r'%s/*' % savedir)
          if not os.path.basename(i).startswith('.')]
    string_to_show = '\nFiles in %s:\n' % savedir
    most_digits = max([len(str(i)) for i, j in enumerate(fs)])
    for index, fname in enumerate(fs):
        string_to_show += str(index).rjust(most_digits) + ':\t' + \
                          os.path.basename(fname) + '\n'
    print(string_to_show)
    # the user's selection must be captured, not discarded
    index = INPUTFUNC('Enter index of item to load: ')
    if ' ' in index or '=' in index:
        if '=' in index:
            index = index.replace(' = ', ' ')
            index = index.replace('=', ' ')
        varname, ind = index.split(' ', 1)
        globals()[varname] = load(os.path.basename(fs[int(ind)]))
        print("%s = %s. Don't do this again." %
              (varname, os.path.basename(fs[int(ind)])))
        return
    try:
        index = int(index)
    except (ValueError, TypeError):
        raise ValueError('Selection not recognised.')
    return load(os.path.basename(fs[index]))
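# A minimal usage sketch for loader(). The INPUTFUNC binding below is an
# assumption about how corpkit aliases input()/raw_input() across Python
# versions; loader() expects the name to exist in its enclosing scope:
try:
    INPUTFUNC = raw_input  # Python 2
except NameError:
    INPUTFUNC = input      # Python 3

# >>> result = loader()  # lists saved_interrogations/, then prompts for an index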
def features(self):
    """
    Generate and show basic stats from the corpus, including number of
    sentences, clauses, process types, etc.

    :Example:

    >>> corpus.features
        SB  Characters  Tokens  Words  Closed class words  Open class words  Clauses
        01       26873    8513   7308                4809              3704     2212
        02       25844    7933   6920                4313              3620     2270
        03       18376    5683   4877                3067              2616     1640
        04       20066    6354   5366                3587              2767     1775
    """
    import os
    from os.path import isfile, isdir, join
    from corpkit.interrogator import interrogator
    from corpkit.other import load
    from corpkit.dictionaries import mergetags
    savedir = 'saved_interrogations'
    if isfile(join(savedir, self.name + '-features.p')):
        try:
            return load(self.name + '-features').results
        except AttributeError:
            return load(self.name + '-features')
    else:
        feat = interrogator(self, 's', 'any').results
        if isdir(savedir):
            feat.save(self.name + '-features')
        return feat
def make_language_model(self, name, **kwargs):
    """
    Make a language model for the corpus

    :param name: a name for the model
    :type name: `str`

    :param kwargs: keyword arguments for the interrogate() method
    :type kwargs: `keyword arguments`

    :returns: a :class:`corpkit.model.MultiModel`
    """
    import os
    from corpkit.other import load
    from corpkit.model import MultiModel
    if not name.endswith('.p'):
        namep = name + '.p'
    else:
        namep = name
    pth = os.path.join('models', namep)
    if os.path.isfile(pth):
        print('Returning saved model: %s' % pth)
        return MultiModel(load(name, loaddir='models'))
    # set some defaults if not passed in as kwargs
    search = kwargs.pop('search', {'i': r'^1$'})
    langmod = not any(i.startswith('n') for i in search.keys())
    res = self.interrogate(search, language_model=langmod, **kwargs)
    return res.language_model(name, search=search, **kwargs)
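# Hedged usage sketch for make_language_model(). The corpus path and model
# name are illustrative assumptions; a second call with the same name returns
# the copy cached under models/:
#
# >>> from corpkit import Corpus
# >>> corpus = Corpus('data/example-parsed')
# >>> lm = corpus.make_language_model('example-lm')  # builds and saves
# >>> lm = corpus.make_language_model('example-lm')  # loads models/example-lm.p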
def lexicon(self, **kwargs):
    """
    Get a lexicon/frequency distribution from a corpus,
    and save to disk for next time.

    :param kwargs: Arguments to pass to the
                   :func:`~corpkit.interrogation.Interrogation.interrogate` method
    :type kwargs: `keyword arguments`

    :returns: a `DataFrame` of tokens and counts
    """
    from os.path import join, isfile, isdir
    from corpkit.interrogator import interrogator
    from corpkit.other import load
    show = kwargs.get('show', ['w'])
    savedir = 'saved_interrogations'
    if isinstance(show, STRINGTYPE):
        show = [show]
    if isfile(join(savedir, self.name + '-lexicon.p')):
        try:
            return load(self.name + '-lexicon')
        except AttributeError:
            pass
    dat = self.interrogate('w', show=show, **kwargs).results
    if isdir(savedir):
        dat.save(self.name + '-lexicon')
    return dat
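# Hedged usage sketch for lexicon(). The first call interrogates the corpus
# and saves the result; later calls load the cached pickle instead:
#
# >>> freqs = corpus.lexicon(show=['w'])
# >>> freqs.sum().head()  # per-token totals across subcorpora (pandas)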
def __init__(self, data, name='', order=3, **kwargs):
    import os
    from corpkit.other import load
    # if a model name was passed in instead of data, use it as the name
    if isinstance(data, STRINGTYPE):
        name = data
    if not name.endswith('.p'):
        name = name + '.p'
    self.name = name
    self.order = order
    self.kwargs = kwargs
    # reload saved model data if a pickle with this name exists
    if os.path.isfile(self.name):
        data = load(self.name, loaddir='models')
    else:
        pth = os.path.join('models', self.name)
        if os.path.isfile(pth):
            data = load(self.name, loaddir='models')
    super(MultiModel, self).__init__(data)
def __init__(self, data, order, name='', **kwargs):
    import os
    from corpkit.other import load
    if isinstance(data, STRINGTYPE):
        name = data
    if not name.endswith('.p'):
        name = name + '.p'
    self.name = name
    self.order = order
    self.kwargs = kwargs
    if os.path.isfile(self.name):
        data = load(self.name, loaddir='models')
    else:
        pth = os.path.join('models', self.name)
        if os.path.isfile(pth):
            data = load(self.name, loaddir='models')
    super(MultiModel, self).__init__(data)
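# Minimal construction sketch for MultiModel. The two __init__ variants above
# differ only in whether `order` is positional; both accept either a saved
# model name or the data itself (the `scores` dict here is an illustrative
# assumption):
#
# >>> mm = MultiModel('mymodel')                        # loads models/mymodel.p if saved
# >>> mm = MultiModel(scores, name='mymodel', order=3)  # wraps an existing dict of models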
def wordclasses(self):
    """
    Generate and show the distribution of word classes in the corpus.

    :Example:

    >>> corpus.wordclasses
        SB   Verb  Noun  Preposition  Determiner  ...
        01  26873  8513         7308        5508  ...
        02  25844  7933         6920        3323  ...
        03  18376  5683         4877        3137  ...
        04  20066  6354         5366        4336  ...
    """
    import os
    from os.path import isfile, isdir, join
    from corpkit.interrogator import interrogator
    from corpkit.other import load
    from corpkit.dictionaries import mergetags
    savedir = 'saved_interrogations'
    if isfile(join(savedir, self.name + '-wordclasses.p')):
        try:
            return load(self.name + '-wordclasses').results
        except AttributeError:
            return load(self.name + '-wordclasses')
    elif isfile(join(savedir, self.name + '-postags.p')):
        try:
            posdata = load(self.name + '-postags').results
        except AttributeError:
            posdata = load(self.name + '-postags')
        return posdata.edit(merge_entries=mergetags,
                            sort_by='total').results
    else:
        feat = interrogator(self, 't', 'any', show='pl').results
        if isdir(savedir):
            feat.save(self.name + '-wordclasses')
        return feat
def load_all_results(data_dir='saved_interrogations', **kwargs):
    """
    Load every saved interrogation in data_dir into a dict:

    >>> r = load_all_results()

    :param data_dir: path to saved data
    :type data_dir: str

    :returns: dict with filenames as keys
    """
    import os
    from time import localtime, strftime
    from corpkit.other import load
    from corpkit.process import makesafe
    root = kwargs.get('root', False)
    note = kwargs.get('note', False)
    datafiles = [f for f in os.listdir(data_dir)
                 if os.path.isfile(os.path.join(data_dir, f))
                 and f.endswith('.p')]
    # just load first n (for testing)
    if kwargs.get('n', False):
        datafiles = datafiles[:kwargs['n']]
    output = {}
    l = 0
    for index, f in enumerate(datafiles):
        try:
            loadname = f.replace('.p', '')
            output[loadname] = load(f, loaddir=data_dir)
            time = strftime("%H:%M:%S", localtime())
            print('%s: %s loaded as %s.' % (time, f, makesafe(loadname)))
            l += 1
        except Exception:
            time = strftime("%H:%M:%S", localtime())
            print('%s: %s failed to load. Try load() on it to see what went wrong.'
                  % (time, f))
        if note and len(datafiles) > 3:
            note.progvar.set((index + 1) * 100.0 / len(datafiles))
        if root:
            root.update()
    time = strftime("%H:%M:%S", localtime())
    print('%s: %d interrogations loaded from %s.'
          % (time, l, os.path.basename(data_dir)))
    from corpkit.interrogation import Interrodict
    return Interrodict(output)
def postags(self):
    """
    Generate and show the distribution of part-of-speech tags in the corpus.

    :Example:

    >>> corpus.postags
        SB     NN    VB    JJ    IN    DT
        01  26873  8513  7308  4809  3704  ...
        02  25844  7933  6920  4313  3620  ...
        03  18376  5683  4877  3067  2616  ...
        04  20066  6354  5366  3587  2767  ...
    """
    import os
    from os.path import isfile, isdir, join
    from corpkit.interrogator import interrogator
    from corpkit.other import load
    from corpkit.dictionaries import mergetags
    savedir = 'saved_interrogations'
    if isfile(join(savedir, self.name + '-postags.p')):
        try:
            return load(self.name + '-postags').results
        except AttributeError:
            return load(self.name + '-postags')
    else:
        feat = interrogator(self, 't', 'any', show='p').results
        if isdir(savedir):
            feat.save(self.name + '-postags')
            # also derive and cache the word class distribution
            wordclss = feat.edit(merge_entries=mergetags,
                                 sort_by='total').results
            wordclss.save(self.name + '-wordclasses')
        return feat
def _make_model_from_interro(self, name, **kwargs):
    import os
    from pandas import DataFrame, Series
    from collections import Counter
    from corpkit.other import load
    nosave = kwargs.get('nosave')
    singlemod = kwargs.get('singlemod')
    if not nosave:
        if not name.endswith('.p'):
            name = name + '.p'
        pth = os.path.join('models', name)
        if os.path.isfile(pth):
            return load(name, loaddir='models')
    scores = {}
    if not hasattr(self, 'results'):
        raise ValueError('Need results attribute to make language model.')
    # determine what we iterate over
    if not singlemod:
        to_iter_over = [(nm, self.results.ix[nm][self.results.ix[nm] > 0])
                        for nm in list(self.results.index)]
    else:
        if isinstance(self.results, Series):
            to_iter_over = [(name, self.results)]
        else:
            to_iter_over = [(name, self.results.sum())]
    try:
        tot = self.results.sum()[self.results.sum() > 0]
        to_iter_over.append(('Corpus', tot))
    except Exception:
        pass
    for subname, subc in list(to_iter_over):
        # get name for file
        dat = Counter(subc.to_dict())
        model = _train(dat, subname, name, **kwargs)
        scores[subname] = model
    if singlemod:
        return list(scores.values())[0]
    mm = MultiModel(scores, name=name, order=kwargs.pop('order', 3), **kwargs)
    if not os.path.isfile(os.path.join('models', name)):
        from corpkit.other import save
        save(scores, name, savedir='models')
    print('Done!\n')
    return mm
def _make_model_from_interro(self, name, order, **kwargs):
    import os
    from pandas import DataFrame, Series
    from corpkit.other import load
    nosave = kwargs.get('nosave')
    singlemod = kwargs.get('singlemod')
    if not nosave:
        if not name.endswith('.p'):
            name = name + '.p'
        pth = os.path.join('models', name)
        if os.path.isfile(pth):
            return load(name, loaddir='models')
    scores = {}
    if not hasattr(self, 'results'):
        raise ValueError('Need results attribute to make language model.')
    # determine what we iterate over
    if not singlemod:
        to_iter_over = [(nm, self.results.ix[nm][self.results.ix[nm] > 0])
                        for nm in list(self.results.index)]
    else:
        if isinstance(self.results, Series):
            to_iter_over = [(name, self.results)]
        else:
            to_iter_over = [(name, self.results.sum())]
    try:
        tot = self.results.sum()[self.results.sum() > 0]
        to_iter_over.append(('Corpus', tot))
    except Exception:
        pass
    for subname, subc in list(to_iter_over):
        # get name for file
        model = _train(subc, subname, name, order=order, **kwargs)
        scores[subname] = model
    if singlemod:
        return list(scores.values())[0]
    mm = MultiModel(scores, order=order, name=name, **kwargs)
    if not os.path.isfile(os.path.join('models', name)):
        from corpkit.other import save
        save(scores, name, savedir='models')
    print('Done!\n')
    return mm
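# Hedged usage sketch: the two _make_model_from_interro() variants above are
# private, reached via the public path shown in make_language_model(), i.e.
# an interrogation result's language_model() method:
#
# >>> res = corpus.interrogate({'i': r'^1$'}, language_model=True)
# >>> mm = res.language_model('example-lm')  # one model per subcorpus, plus 'Corpus'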
def train(self, name, **kwargs):
    """
    Load, make and save a model
    """
    import os
    import pickle
    ofile = '%s-model.p' % name
    d = os.path.basename(os.path.dirname(self.path))
    if not os.path.isdir('models'):
        os.makedirs('models')
    odir = os.path.join('models', d)
    if not os.path.isdir(odir):
        os.makedirs(odir)
    fp = os.path.join(odir, ofile)
    if os.path.isfile(fp):
        from corpkit.other import load
        return load(fp, loaddir='.')
    else:
        print('Making model: %s ... ' % name)
        # LanguageModel and sents are assumed to come from the enclosing
        # module/scope, as in the original source
        lm = LanguageModel(kwargs.get('size', 3),
                           kwargs.get('alpha', 0.4),
                           sents)
        if not os.path.isfile(fp):
            with open(fp, 'wb') as fo:
                pickle.dump(lm, fo)
        return lm
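# Hedged usage sketch for train(). It assumes the method lives on an object
# with a .path attribute and that a `sents` sentence source is in scope, as
# the body above requires; `size` and `alpha` mirror its kwarg defaults:
#
# >>> lm = trainer.train('example', size=3, alpha=0.4)  # `trainer` is hypothetical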
def quickview(results, n=25):
    """
    View top n results as painlessly as possible.

    :param results: Interrogation data
    :type results: :class:`corpkit.interrogation.Interrogation`
    :param n: Show top *n* results
    :type n: int
    :returns: None
    """
    import corpkit
    import pandas as pd
    import numpy as np
    import os
    from corpkit.interrogation import Interrogation
    # handle dictionaries too:
    dictpath = 'dictionaries'
    savedpath = 'saved_interrogations'
    # too lazy to code this properly for every possible data type:
    if n == 'all':
        n = 9999
    dtype = corpkit.interrogation.Interrogation
    if isinstance(results, STRINGTYPE):
        if os.path.isfile(os.path.join(dictpath, results)):
            from corpkit.other import load
            results = load(results, loaddir=dictpath)
        elif os.path.isfile(os.path.join(savedpath, results)):
            from corpkit.other import load
            results = load(results)
        else:
            raise OSError('File "%s" not found.' % os.path.abspath(results))
    elif isinstance(results, Interrogation):
        if getattr(results, 'results'):
            datatype = results.results.iloc[0, 0].dtype
            if datatype == 'int64':
                option = 't'
            else:
                option = '%'
            rq = results.query.get('operation', False)
            if rq:
                rq = rq.lower()
                if rq.startswith('k'):
                    option = 'k'
                if rq.startswith('%'):
                    option = '%'
                if rq.startswith('/'):
                    option = '/'
            try:
                the_list = list(results.results.columns)[:n]
            except AttributeError:
                the_list = list(results.results.index)[:n]
        else:
            print(results.totals)
            return
    else:
        raise ValueError('Results not recognised.')
    # get longest word length for justification
    longest = max([len(i) for i in the_list])
    for index, entry in enumerate(the_list):
        if option == 't':
            if isinstance(results, Interrogation):
                if hasattr(results, 'results'):
                    to_get_from = results.results
                    tot = to_get_from[entry].sum()
                else:
                    to_get_from = results.totals
                    tot = to_get_from[entry]
                print('%s: %s (n=%d)' % (str(index).rjust(3),
                                         entry.ljust(longest), tot))
        elif option == '%' or option == '/':
            if isinstance(results, Interrogation):
                to_get_from = results.totals
                tot = to_get_from[entry]
                totstr = "%.3f" % tot
                print('%s: %s (%s%%)' % (str(index).rjust(3),
                                         entry.ljust(longest), totstr))
            elif dtype == corpkit.interrogation.Results:
                print('%s: %s (%s)' % (str(index).rjust(3),
                                       entry.ljust(longest), option))
            elif dtype == corpkit.interrogation.Totals:
                tot = results[entry]
                totstr = "%.3f" % tot
                print('%s: %s (%s%%)' % (str(index).rjust(3),
                                         entry.ljust(longest), totstr))
        elif option == 'k':
            print('%s: %s (l/l)' % (str(index).rjust(3), entry.ljust(longest)))
        else:
            print('%s: %s' % (str(index).rjust(3), entry.ljust(longest)))
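# Hedged usage sketch for quickview(); the search pattern is illustrative:
#
# >>> res = corpus.interrogate({'w': r'^fr'})
# >>> quickview(res, n=5)    # prints the top five entries with totals
# >>> quickview('mydata.p')  # or load by filename from saved_interrogations/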
def __init__(self, path, **kwargs):
    import re
    import operator
    import glob
    import os
    from os.path import join, isfile, isdir, abspath, dirname, basename
    from corpkit.process import determine_datatype

    # levels are 'c' for corpus, 's' for subcorpus and 'f' for file. Which
    # one is determined automatically below, and processed accordingly. We
    # assume it is a full corpus to begin with.
    self.data = None
    level = kwargs.pop('level', 'c')
    self.datatype = kwargs.pop('datatype', None)
    print_info = kwargs.get('print_info', True)

    if isinstance(path, (list, Datalist)):
        self.path = abspath(dirname(path[0].path.rstrip('/')))
        self.name = basename(self.path)
        self.data = path
    elif isinstance(path, STRINGTYPE):
        self.path = abspath(path)
        self.name = basename(path)
    elif hasattr(path, 'path') and path.path:
        self.path = abspath(path.path)
        self.name = basename(path.path)

    # this messy code figures out as quickly as possible what the datatype
    # and singlefile status of the path is. it's messy because it shortcuts
    # full checking where possible; some of the shortcutting could maybe be
    # moved into the determine_datatype() funct.
    self.singlefile = False
    if os.path.isfile(self.path):
        if self.path.endswith('.xml'):
            self.datatype = 'parse'
            self.singlefile = True
    else:
        if not isdir(self.path):
            if isdir(join('data', path)):
                self.path = abspath(join('data', path))
        if self.path.endswith('-parsed'):
            self.datatype = 'parse'
            if len([d for d in os.listdir(self.path)
                    if isdir(join(self.path, d))]) > 0:
                self.singlefile = False
            if len([d for d in os.listdir(self.path)
                    if isdir(join(self.path, d))]) == 0:
                level = 's'
        else:
            if level == 'c':
                if not self.datatype:
                    self.datatype, self.singlefile = determine_datatype(self.path)
            if isdir(self.path):
                if len([d for d in os.listdir(self.path)
                        if isdir(join(self.path, d))]) == 0:
                    level = 's'

    # if initialised on a file, process as file
    if self.singlefile and level == 'c':
        level = 'f'
    self.level = level

    # load each interrogation as an attribute
    if kwargs.get('load_saved', False):
        from corpkit.other import load
        from corpkit.process import makesafe
        if os.path.isdir('saved_interrogations'):
            saved_files = glob.glob(r'saved_interrogations/*')
            for filepath in saved_files:
                filename = os.path.basename(filepath)
                if not filename.startswith(self.name):
                    continue
                not_filename = filename.replace(self.name + '-', '')
                not_filename = os.path.splitext(not_filename)[0]
                if not_filename in ['features', 'wordclasses', 'postags']:
                    continue
                variable_safe = makesafe(not_filename)
                try:
                    setattr(self, variable_safe, load(filename))
                    if print_info:
                        print('\tLoaded %s as %s attribute.' %
                              (filename, variable_safe))
                except AttributeError:
                    if print_info:
                        print('\tFailed to load %s as %s attribute. Name conflict?' %
                              (filename, variable_safe))

    if print_info:
        print('Corpus: %s' % self.path)
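# Hedged usage sketch for this constructor (class name per the corpkit API;
# the path is an illustrative assumption):
#
# >>> corpus = Corpus('data/example-parsed', load_saved=True)
# Corpus: <absolute path to data/example-parsed>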