Example #1
0
def loader(savedir='saved_interrogations'):
    """Show a list of data that can be loaded, and then load by user input of index"""
    import glob
    import os
    import corpkit
    from corpkit.other import load
    fs = [
        i for i in glob.glob(r'%s/*' % savedir)
        if not os.path.basename(i).startswith('.')
    ]
    if not fs:
        # FIX: bail out early on an empty directory instead of crashing
        # on max() over an empty sequence below
        print('No saved files found in %s.' % savedir)
        return
    string_to_show = '\nFiles in %s:\n' % savedir
    # widest index width, used to right-justify the menu numbers
    most_digits = max(len(str(i)) for i in range(len(fs)))
    for index, fname in enumerate(fs):
        string_to_show += str(index).rjust(
            most_digits) + ':\t' + os.path.basename(fname) + '\n'
    print(string_to_show)
    # FIX: the original discarded the user's input, leaving `index` bound to
    # the integer loop variable above; capture the response here
    index = INPUTFUNC('Enter index of item to load: ')
    if ' ' in index or '=' in index:
        # user typed e.g. "result = 3": bind the loaded data to that name
        if '=' in index:
            index = index.replace(' = ', ' ')
            index = index.replace('=', ' ')
        varname, ind = index.split(' ', 1)
        globals()[varname] = load(os.path.basename(fs[int(ind)]))
        print("%s = %s. Don't do this again." %
              (varname, os.path.basename(fs[int(ind)])))
        return
    try:
        index = int(index)
    # FIX: narrow the bare `except` to the conversion failures we expect
    except (TypeError, ValueError):
        raise ValueError('Selection not recognised.')
    return load(os.path.basename(fs[index]))
Example #2
0
File: corpus.py  Project: javelir/corpkit
    def features(self):
        """
        Generate and show basic corpus statistics (sentences, clauses,
        process types, etc.), caching the interrogation on disk.

        :Example:

        >>> corpus.features
            SB  Characters  Tokens  Words  Closed class words  Open class words  Clauses
            01       26873    8513   7308                4809              3704     2212
            02       25844    7933   6920                4313              3620     2270
            03       18376    5683   4877                3067              2616     1640
            04       20066    6354   5366                3587              2767     1775

        """
        import os
        from os.path import isfile, isdir, join
        from corpkit.interrogator import interrogator
        from corpkit.other import load
        from corpkit.dictionaries import mergetags

        savedir = 'saved_interrogations'
        stored = self.name + '-features'
        # reuse a previously saved interrogation when one exists on disk
        if isfile(join(savedir, stored + '.p')):
            try:
                return load(stored).results
            except AttributeError:
                # older saves may already be the bare results frame
                return load(stored)
        # nothing cached: interrogate from scratch and save if possible
        computed = interrogator(self, 's', 'any').results
        if isdir(savedir):
            computed.save(stored)
        return computed
Example #3
0
File: corpus.py  Project: javelir/corpkit
    def make_language_model(self, name, **kwargs):
        """
        Build (or fetch a previously saved) language model for the corpus.

        :param name: a name for the model
        :type name: `str`

        :param kwargs: keyword arguments for the interrogate() method
        :type kwargs: `keyword arguments`

        :returns: a :class:`corpkit.model.MultiModel`
        """
        import os
        from corpkit.other import load
        from corpkit.model import MultiModel
        # normalise the on-disk filename to end in .p
        namep = name if name.endswith('.p') else name + '.p'
        pth = os.path.join('models', namep)
        if os.path.isfile(pth):
            print('Returning saved model: %s' % pth)
            return MultiModel(load(name, loaddir='models'))

        # defaults when the caller did not supply them
        search = kwargs.pop('search', {'i': r'^1$'})
        # an n-gram style search key disables the language-model flag
        langmod = not any(k.startswith('n') for k in search)

        res = self.interrogate(search, language_model=langmod, **kwargs)
        return res.language_model(name, search=search, **kwargs)
Example #4
0
File: corpus.py  Project: javelir/corpkit
    def lexicon(self, **kwargs):
        """
        Get a lexicon/frequency distribution from a corpus,
        saving it to disk for reuse on later calls.

        :param kwargs: Arguments passed through to the
                       :func:`~corpkit.interrogation.Interrogation.interrogate` method
        :type kwargs: `keyword arguments`

        :returns: a `DataFrame` of tokens and counts
        """
        from os.path import join, isfile, isdir
        from corpkit.interrogator import interrogator
        from corpkit.other import load
        show = kwargs.get('show', ['w'])
        savedir = 'saved_interrogations'
        # a single string is promoted to a one-item list of things to show
        if isinstance(show, STRINGTYPE):
            show = [show]
        stored = self.name + '-lexicon'
        # return the cached lexicon when one can be loaded cleanly
        if isfile(join(savedir, stored + '.p')):
            try:
                return load(stored)
            except AttributeError:
                pass
        freqs = self.interrogate('w', show=show, **kwargs).results
        if isdir(savedir):
            freqs.save(stored)
        return freqs
Example #5
0
File: model.py  Project: javelir/corpkit
 def __init__(self, data, name='', order=3, **kwargs):
     """
     Initialise the model container from raw data or, when *data* is a
     string, from a model saved under that name in models/.
     """
     import os
     from corpkit.other import load
     if isinstance(data, STRINGTYPE):
         # a string means "load the model saved under this name"
         name = data if data.endswith('.p') else data + '.p'
     self.name = name
     self.order = order
     self.kwargs = kwargs
     if os.path.isfile(self.name):
         data = load(self.name, loaddir='models')
     elif os.path.isfile(os.path.join('models', self.name)):
         # fall back to looking inside the models directory
         data = load(self.name, loaddir='models')
     super(MultiModel, self).__init__(data)
Example #6
0
 def __init__(self, data, order, name='', **kwargs):
     """
     Build the model container, loading saved data when *data* names a
     pickle file (directly or under models/).
     """
     import os
     from corpkit.other import load
     if isinstance(data, STRINGTYPE):
         name = data
         if not name.endswith('.p'):
             name += '.p'
     self.name = name
     self.order = order
     self.kwargs = kwargs
     # prefer a file in the working directory, then one under models/
     candidate = self.name if os.path.isfile(self.name) \
         else os.path.join('models', self.name)
     if os.path.isfile(candidate):
         data = load(self.name, loaddir='models')
     super(MultiModel, self).__init__(data)
Example #7
0
File: corpus.py  Project: javelir/corpkit
    def wordclasses(self):
        """
        Generate and show word-class counts for the corpus, caching the
        result on disk and reusing saved POS-tag data when available.

        :Example:

        >>> corpus.wordclasses
            SB   Verb  Noun  Preposition   Determiner ...
            01  26873  8513         7308         5508 ...
            02  25844  7933         6920         3323 ...
            03  18376  5683         4877         3137 ...
            04  20066  6354         5366         4336 ...

        """
        import os
        from os.path import isfile, isdir, join
        from corpkit.interrogator import interrogator
        from corpkit.other import load
        from corpkit.dictionaries import mergetags

        savedir = 'saved_interrogations'
        wc_name = self.name + '-wordclasses'
        pos_name = self.name + '-postags'
        if isfile(join(savedir, wc_name + '.p')):
            # a saved wordclass interrogation exists: use it directly
            try:
                return load(wc_name).results
            except AttributeError:
                return load(wc_name)
        if isfile(join(savedir, pos_name + '.p')):
            # derive wordclasses by merging a saved POS-tag interrogation
            try:
                posdata = load(pos_name).results
            except AttributeError:
                posdata = load(pos_name)
            return posdata.edit(merge_entries=mergetags,
                                sort_by='total').results
        # nothing saved: interrogate the corpus from scratch
        counts = interrogator(self, 't', 'any', show='pl').results
        if isdir(savedir):
            counts.save(wc_name)
        return counts
Example #8
0
def load_all_results(data_dir='saved_interrogations', **kwargs):
    """
    Load every saved interrogation in data_dir into a dict:

        >>> r = load_all_results()

    :param data_dir: path to saved data
    :type data_dir: str

    :returns: dict with filenames as keys
    """
    import os
    from time import localtime, strftime
    from other import load
    from process import makesafe

    # optional GUI hooks: a Tk root window and a progress-bar holder
    root = kwargs.get('root', False)
    note = kwargs.get('note', False)

    datafiles = [f for f in os.listdir(data_dir)
                 if os.path.isfile(os.path.join(data_dir, f))
                 and f.endswith('.p')]

    # just load first n (for testing)
    if kwargs.get('n', False):
        datafiles = datafiles[:kwargs['n']]

    output = {}

    n_loaded = 0
    for index, f in enumerate(datafiles):
        try:
            loadname = f.replace('.p', '')
            output[loadname] = load(f, loaddir=data_dir)
            time = strftime("%H:%M:%S", localtime())
            print('%s: %s loaded as %s.' % (time, f, makesafe(loadname)))
            n_loaded += 1
        # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit; keep the best-effort behaviour for ordinary errors
        except Exception:
            time = strftime("%H:%M:%S", localtime())
            print(
                '%s: %s failed to load. Try using load to find out the matter.'
                % (time, f))
        # update the optional progress bar / GUI window
        if note and len(datafiles) > 3:
            note.progvar.set((index + 1) * 100.0 / len(datafiles))
        if root:
            root.update()
    time = strftime("%H:%M:%S", localtime())
    print('%s: %d interrogations loaded from %s.' %
          (time, n_loaded, os.path.basename(data_dir)))
    from interrogation import Interrodict
    return Interrodict(output)
Example #9
0
File: corpus.py  Project: javelir/corpkit
    def postags(self):
        """
        Generate and show POS-tag counts for the corpus, caching the
        result (plus a derived wordclass table) on disk.

        :Example:

        >>> corpus.postags
            SB      NN     VB     JJ     IN     DT 
            01   26873   8513   7308   4809   3704  ...
            02   25844   7933   6920   4313   3620  ...
            03   18376   5683   4877   3067   2616  ...
            04   20066   6354   5366   3587   2767  ...

        """
        import os
        from os.path import isfile, isdir, join
        from corpkit.interrogator import interrogator
        from corpkit.other import load
        from corpkit.dictionaries import mergetags

        savedir = 'saved_interrogations'
        stored = self.name + '-postags'
        # reuse a previously saved interrogation when one exists
        if isfile(join(savedir, stored + '.p')):
            try:
                return load(stored).results
            except AttributeError:
                # older saves may already be the bare results frame
                return load(stored)
        # nothing saved: interrogate the corpus from scratch
        tags = interrogator(self, 't', 'any', show='p').results
        if isdir(savedir):
            tags.save(stored)
            # also derive and save the merged wordclass version
            merged = tags.edit(merge_entries=mergetags,
                               sort_by='total').results
            merged.save(self.name + '-wordclasses')
        return tags
Example #10
0
File: model.py  Project: javelir/corpkit
def _make_model_from_interro(self, name, **kwargs):
    """
    Build a language model (or one per subcorpus) from an interrogation's
    results, saving under models/ unless ``nosave`` is passed.
    """
    import os
    from pandas import DataFrame, Series
    from collections import Counter
    from corpkit.other import load
    nosave = kwargs.get('nosave')
    singlemod = kwargs.get('singlemod')
    if not nosave:
        if not name.endswith('.p'):
            name = name + '.p'
        pth = os.path.join('models', name)
        # reuse a previously saved model when one exists
        if os.path.isfile(pth):
            return load(name, loaddir='models')
    scores = {}
    if not hasattr(self, 'results'):
        raise ValueError('Need results attribute to make language model.')
    # determine what we iterate over: one model per subcorpus row, or a
    # single model over the whole corpus
    if not singlemod:
        # NOTE(review): .ix is deprecated pandas API (.loc is the modern
        # equivalent); left as-is to avoid changing label/position semantics
        to_iter_over = [(nm, self.results.ix[nm][self.results.ix[nm] > 0])
                        for nm in list(self.results.index)]
    else:
        if isinstance(self.results, Series):
            to_iter_over = [(name, self.results)]
        else:
            to_iter_over = [(name, self.results.sum())]
    try:
        tot = self.results.sum()[self.results.sum() > 0]
        to_iter_over.append(('Corpus', tot))
    # FIX: was a bare `except:`; the corpus-wide total is optional, so skip
    # it on ordinary errors only
    except Exception:
        pass
    for subname, subc in list(to_iter_over):
        # train one model per (sub)corpus
        dat = Counter(subc.to_dict())
        model = _train(dat, subname, name, **kwargs)
        scores[subname] = model
    if singlemod:
        # FIX: dict.values() is not subscriptable in Python 3
        return next(iter(scores.values()))
    mm = MultiModel(scores, name=name, order=kwargs.pop('order', 3), **kwargs)
    if not os.path.isfile(os.path.join('models', name)):
        from corpkit.other import save
        save(scores, name, savedir='models')
    print('Done!\n')
    return mm
Example #11
0
def _make_model_from_interro(self, name, order, **kwargs):
    """
    Build a language model of the given *order* (or one per subcorpus)
    from an interrogation's results, saving under models/ unless
    ``nosave`` is passed.
    """
    import os
    from pandas import DataFrame, Series
    from corpkit.other import load
    nosave = kwargs.get('nosave')
    singlemod = kwargs.get('singlemod')
    if not nosave:
        if not name.endswith('.p'):
            name = name + '.p'
        pth = os.path.join('models', name)
        # reuse a previously saved model when one exists
        if os.path.isfile(pth):
            return load(name, loaddir='models')
    scores = {}
    if not hasattr(self, 'results'):
        raise ValueError('Need results attribute to make language model.')
    # determine what we iterate over
    if not singlemod:
        # NOTE(review): .ix is deprecated pandas API (.loc is the modern
        # equivalent); left as-is to avoid changing label/position semantics
        to_iter_over = [(nm, self.results.ix[nm][self.results.ix[nm] > 0])
                        for nm in list(self.results.index)]
    else:
        if isinstance(self.results, Series):
            to_iter_over = [(name, self.results)]
        else:
            to_iter_over = [(name, self.results.sum())]
    try:
        tot = self.results.sum()[self.results.sum() > 0]
        to_iter_over.append(('Corpus', tot))
    # FIX: was a bare `except:`; the corpus-wide total is optional, so skip
    # it on ordinary errors only
    except Exception:
        pass
    for subname, subc in list(to_iter_over):
        # train one model per (sub)corpus
        model = _train(subc, subname, name, order=order, **kwargs)
        scores[subname] = model
    if singlemod:
        # FIX: dict.values() is not subscriptable in Python 3
        return next(iter(scores.values()))
    mm = MultiModel(scores, order=order, name=name, **kwargs)
    if not os.path.isfile(os.path.join('models', name)):
        from corpkit.other import save
        save(scores, name, savedir='models')
    print('Done!\n')
    return mm
Example #12
0
def train(self, name, **kwargs):
    """
    Load, make and save a model
    """
    # output filename, and a per-corpus subdirectory under models/
    ofile = '%s-model.p' % name
    d = os.path.basename(os.path.dirname(self.path))
    if not os.path.isdir('models'):
        os.makedirs('models')
    odir = os.path.join('models', d)
    if not os.path.isdir(odir):
        os.makedirs(odir)
    fp = os.path.join(odir, ofile)
    # reuse a previously trained model when one is already on disk
    if os.path.isfile(fp):
        from corpkit.other import load
        return load(fp, loaddir='.')
    else:
        print('Making model: %s ... ' % name)

    # NOTE(review): `sents` is not defined in this function and not visible
    # in this file chunk -- presumably a module-level global or defined
    # earlier; confirm. `pickle`, `os` and `LanguageModel` must likewise be
    # in scope at module level.
    lm = LanguageModel(kwargs.get('size', 3), kwargs.get('alpha', 0.4), sents)
    # the isfile check is redundant here (we returned above if it existed),
    # but kept as a guard against races
    if not os.path.isfile(fp):
        with open(fp, 'wb') as fo:
            pickle.dump(lm, fo)
    return lm
Example #13
0
def quickview(results, n=25):
    """
    View top n results as painlessly as possible.

    :param results: Interrogation data, or the filename of a saved
                    interrogation or dictionary to load first
    :type results: :class:``corpkit.interrogation.Interrogation`` or str
    :param n: Show top *n* results
    :type n: int
    :returns: None
    """

    import corpkit
    import pandas as pd
    import numpy as np
    import os
    from corpkit.interrogation import Interrogation

    # handle dictionaries too:
    dictpath = 'dictionaries'
    savedpath = 'saved_interrogations'

    # too lazy to code this properly for every possible data type:
    if n == 'all':
        n = 9999

    dtype = corpkit.interrogation.Interrogation

    if isinstance(results, STRINGTYPE):
        # a filename: load it, then start over with the loaded object.
        # FIX: the original fell through after loading with `option` and
        # `the_list` never assigned, raising NameError further down.
        if os.path.isfile(os.path.join(dictpath, results)):
            from corpkit.other import load
            results = load(results, loaddir=dictpath)
        elif os.path.isfile(os.path.join(savedpath, results)):
            from corpkit.other import load
            results = load(results)
        else:
            raise OSError('File "%s" not found.' % os.path.abspath(results))
        return quickview(results, n=n)

    elif isinstance(results, Interrogation):
        if getattr(results, 'results'):
            # pick a display style from the data and any stored operation
            datatype = results.results.iloc[0, 0].dtype
            if datatype == 'int64':
                option = 't'
            else:
                option = '%'
            rq = results.query.get('operation', False)
            if rq:
                rq = rq.lower()
                if rq.startswith('k'):
                    option = 'k'
                if rq.startswith('%'):
                    option = '%'
                if rq.startswith('/'):
                    option = '/'
            # FIX: was a bare `except:`; a Series has no .columns, so fall
            # back to the index, but only for ordinary errors
            try:
                the_list = list(results.results.columns)[:n]
            except Exception:
                the_list = list(results.results.index)[:n]
        else:
            print(results.totals)
            return
    else:
        raise ValueError('Results not recognised.')

    # get longest word length for justification
    longest = max([len(i) for i in the_list])

    for index, entry in enumerate(the_list):
        if option == 't':
            if isinstance(results, Interrogation):
                if hasattr(results, 'results'):
                    to_get_from = results.results
                    tot = to_get_from[entry].sum()
                else:
                    to_get_from = results.totals
                    tot = to_get_from[entry]
            print('%s: %s (n=%d)' %
                  (str(index).rjust(3), entry.ljust(longest), tot))
        elif option == '%' or option == '/':
            if isinstance(results, Interrogation):
                to_get_from = results.totals
                tot = to_get_from[entry]
                totstr = "%.3f" % tot
                print('%s: %s (%s%%)' %
                      (str(index).rjust(3), entry.ljust(longest), totstr))
            elif dtype == corpkit.interrogation.Results:
                print('%s: %s (%s)' %
                      (str(index).rjust(3), entry.ljust(longest), option))
            elif dtype == corpkit.interrogation.Totals:
                tot = results[entry]
                totstr = "%.3f" % tot
                print('%s: %s (%s%%)' %
                      (str(index).rjust(3), entry.ljust(longest), totstr))
        elif option == 'k':
            print('%s: %s (l/l)' % (str(index).rjust(3), entry.ljust(longest)))
        else:
            print('%s: %s' % (str(index).rjust(3), entry.ljust(longest)))
Example #14
0
File: corpus.py  Project: javelir/corpkit
    def __init__(self, path, **kwargs):
        """
        Set up a corpus object from *path*.

        :param path: a path string, a list/Datalist of corpus files, or an
                     object exposing a ``path`` attribute
        :param kwargs: consumed here: ``level`` ('c'/'s'/'f'),
                       ``datatype``, ``print_info``, ``load_saved``
        """
        import re
        import operator
        import glob
        import os
        from os.path import join, isfile, isdir, abspath, dirname, basename

        from corpkit.process import determine_datatype

        # levels are 'c' for corpus, 's' for subcorpus and 'f' for file. Which
        # one is determined automatically below, and processed accordingly. We
        # assume it is a full corpus to begin with.

        self.data = None

        level = kwargs.pop('level', 'c')
        self.datatype = kwargs.pop('datatype', None)
        print_info = kwargs.get('print_info', True)

        # work out self.path/self.name from whichever form the caller gave
        if isinstance(path, (list, Datalist)):
            self.path = abspath(dirname(path[0].path.rstrip('/')))
            self.name = basename(self.path)
            self.data = path
        elif isinstance(path, STRINGTYPE):
            self.path = abspath(path)
            self.name = basename(path)
        elif hasattr(path, 'path') and path.path:
            self.path = abspath(path.path)
            self.name = basename(path.path)
        # this messy code figures out as quickly as possible what the datatype
        # and singlefile status of the path is. it's messy because it shortcuts
        # full checking where possible some of the shortcutting could maybe be
        # moved into the determine_datatype() funct.

        self.singlefile = False
        if os.path.isfile(self.path):
            # a single .xml file is assumed to be parsed output
            if self.path.endswith('.xml'):
                self.datatype = 'parse'
            self.singlefile = True
        else:
            # not a file: maybe the caller gave a name relative to ./data
            if not isdir(self.path):
                if isdir(join('data', path)):
                    self.path = abspath(join('data', path))
        if self.path.endswith('-parsed'):
            self.datatype = 'parse'
            # a parsed dir with no subdirectories is treated as a subcorpus
            if len([d for d in os.listdir(self.path)
                    if isdir(join(self.path, d))]) > 0:
                self.singlefile = False
            if len([d for d in os.listdir(self.path)
                    if isdir(join(self.path, d))]) == 0:
                level = 's'
        else:
            if level == 'c':
                if not self.datatype:
                    self.datatype, self.singlefile = determine_datatype(
                        self.path)
            if isdir(self.path):
                if len([d for d in os.listdir(self.path)
                        if isdir(join(self.path, d))]) == 0:
                    level = 's'

        # if initialised on a file, process as file
        if self.singlefile and level == 'c':
            level = 'f'

        self.level = level

        # load each interrogation as an attribute
        if kwargs.get('load_saved', False):
            from corpkit.other import load
            from corpkit.process import makesafe
            if os.path.isdir('saved_interrogations'):
                saved_files = glob.glob(r'saved_interrogations/*')
                for filepath in saved_files:
                    filename = os.path.basename(filepath)
                    # only pick up interrogations saved from this corpus
                    if not filename.startswith(self.name):
                        continue
                    not_filename = filename.replace(self.name + '-', '')
                    not_filename = os.path.splitext(not_filename)[0]
                    # these are exposed through dedicated methods instead
                    if not_filename in ['features', 'wordclasses', 'postags']:
                        continue
                    variable_safe = makesafe(not_filename)
                    try:
                        setattr(self, variable_safe, load(filename))
                        if print_info:
                            print(
                                '\tLoaded %s as %s attribute.' %
                                (filename, variable_safe))
                    except AttributeError:
                        if print_info:
                            print(
                                '\tFailed to load %s as %s attribute. Name conflict?' %
                                (filename, variable_safe))

        if print_info:
            print('Corpus: %s' % self.path)