Exemplo n.º 1
0
    def make_filename(interrogation, savename):
        """
        Build the filename under which an interrogation is pickled.

        :param interrogation: the object being saved; when it has a dict
                              ``query`` attribute, that dict's ``'corpus'``
                              entry is used to prefix the filename
        :param savename: the requested name; returned unchanged when it
                         contains a ``/``
        :type savename: str

        :returns: ``savename`` passed through ``makesafe`` and given a ``.p``
                  extension, prefixed with ``<corpus name>-`` when a corpus
                  name could be worked out
        """
        # a name containing a slash is handed back exactly as given
        if '/' in savename:
            return savename

        # the extension is removed before makesafe runs and re-added after
        if savename.endswith('.p'):
            savename = savename[:-2]
        savename = makesafe(savename, drop_datatype=False, hyphens_ok=True)
        if not savename.endswith('.p'):
            savename = savename + '.p'

        firstpart = ''
        if hasattr(interrogation, 'query') and isinstance(
                interrogation.query, dict):
            corpus = interrogation.query.get('corpus', False)
            if corpus:
                # One elif chain: previously a trailing ``else`` reset
                # firstpart to '' right after the Datalist branch had set it,
                # so a Datalist without a ``name`` attribute lost its prefix.
                if isinstance(corpus, STRINGTYPE):
                    firstpart = corpus
                elif hasattr(corpus, 'name'):
                    firstpart = corpus.name
                elif isinstance(corpus, Datalist):
                    firstpart = Corpus(corpus).name

        # only the last path component of the corpus name is used
        firstpart = os.path.basename(firstpart)

        if firstpart:
            return firstpart + '-' + savename
        return savename
Exemplo n.º 2
0
 def __init__(self, data):
     """
     Populate the Interrodict and mirror its entries as attributes.

     :param data: a mapping, or a list that ``OrderedDict`` can consume
     """
     from corpkit.process import makesafe
     # lists are turned into an OrderedDict before anything else happens
     data = OrderedDict(data) if isinstance(data, list) else data
     # every value becomes reachable as an attribute named makesafe(key)
     for name, entry in data.items():
         setattr(self, makesafe(name), entry)
     # set after the loop, so it replaces any attribute called 'query'
     self.query = None
     super(Interrodict, self).__init__(data)
Exemplo n.º 3
0
 def __init__(self, data):
     """
     Populate the Interrodict and mirror its entries as attributes.

     :param data: a mapping, or a list that ``OrderedDict`` can consume
     """
     from corpkit.process import makesafe
     # lists are turned into an OrderedDict before anything else happens
     data = OrderedDict(data) if isinstance(data, list) else data
     # every value becomes reachable as an attribute named makesafe(key)
     for name, entry in data.items():
         setattr(self, makesafe(name), entry)
     # set after the loop, so it replaces any attribute called 'query'
     self.query = None
     super(Interrodict, self).__init__(data)
Exemplo n.º 4
0
def load_all_results(data_dir='saved_interrogations', **kwargs):
    """
    Load every saved interrogation in data_dir into a dict:

        >>> r = load_all_results()

    :param data_dir: path to saved data
    :type data_dir: str
    :param n: (keyword) if truthy, only the first ``n`` files are loaded
              (for testing)
    :param root: (keyword) optional object whose ``update()`` is called
                 after each file
    :param note: (keyword) optional object whose ``progvar.set()`` is given
                 the percentage of files processed so far; only used when
                 there are more than three files

    :returns: Interrodict with filenames (minus the ``.p`` extension) as keys
    """
    import os
    from time import localtime, strftime
    from other import load
    from process import makesafe

    def timestamp():
        """Current local time, formatted for the progress messages."""
        return strftime("%H:%M:%S", localtime())

    root = kwargs.get('root', False)
    note = kwargs.get('note', False)
    limit = kwargs.get('n', False)

    datafiles = [f for f in os.listdir(data_dir)
                 if f.endswith('.p')
                 and os.path.isfile(os.path.join(data_dir, f))]

    # just load first n (for testing)
    if limit:
        datafiles = datafiles[:limit]

    total = len(datafiles)
    output = {}
    loaded = 0
    for index, f in enumerate(datafiles):
        try:
            # Drop only the trailing extension. str.replace('.p', '') would
            # also delete any '.p' inside the name ('my.parsed.p').
            loadname = f[:-len('.p')]
            output[loadname] = load(f, loaddir=data_dir)
            print('%s: %s loaded as %s.'
                  % (timestamp(), f, makesafe(loadname)))
            loaded += 1
        except Exception:
            # Best effort: report the file and carry on. Exception (not a
            # bare except) lets KeyboardInterrupt/SystemExit propagate.
            print(
                '%s: %s failed to load. Try using load to find out the matter.'
                % (timestamp(), f))
        if note and total > 3:
            note.progvar.set((index + 1) * 100.0 / total)
        if root:
            root.update()
    print('%s: %d interrogations loaded from %s.' %
          (timestamp(), loaded, os.path.basename(data_dir)))
    from interrogation import Interrodict
    return Interrodict(output)
Exemplo n.º 5
0
def folderise(folder):
    """
    Move each ``.txt`` file found directly in ``folder`` into a directory
    of its own.

    The directory is created inside ``folder`` if it does not exist yet and
    is named after the file (extension removed, passed through ``makesafe``).

    :param folder: directory to reorganise
    """
    import os
    import shutil
    from glob import glob
    from corpkit.process import makesafe
    for textfile in glob(os.path.join(folder, '*.txt')):
        stem, _ = os.path.splitext(os.path.basename(textfile))
        target = os.path.join(folder, makesafe(stem))
        if not os.path.exists(target):
            os.makedirs(target)
        shutil.move(textfile, target)
Exemplo n.º 6
0
def folderise(folder):
    """
    Move each ``.txt`` file found directly in ``folder`` into a directory
    of its own.

    The directory is created inside ``folder`` if it does not exist yet and
    is named after the file (extension removed, passed through ``makesafe``).

    :param folder: directory to reorganise
    """
    import os
    import shutil
    from glob import glob
    from corpkit.process import makesafe
    for textfile in glob(os.path.join(folder, '*.txt')):
        stem, _ = os.path.splitext(os.path.basename(textfile))
        target = os.path.join(folder, makesafe(stem))
        if not os.path.exists(target):
            os.makedirs(target)
        shutil.move(textfile, target)
Exemplo n.º 7
0
 def __getitem__(self, key):
     """
     Support slicing, integer indexing and lookup by attribute name.

     * slice: returns a ``Corpora`` built from the selected items
     * int: the item at that position in ``self.data`` is passed through
       ``makesafe`` and looked up again by the resulting name
     * anything else: treated as an attribute name; if that lookup fails
       and ``is_number(key)`` holds, ``'c' + key`` is tried instead.
       Otherwise ``None`` is returned implicitly.
     """
     from corpkit.process import makesafe
     if isinstance(key, slice):
         # expand the slice into concrete positions
         positions = xrange(*key.indices(len(self)))
         return Corpora([self[position] for position in positions])
     if type(key) == int:
         return self.__getitem__(makesafe(self.data[key]))
     try:
         return self.__getattribute__(key)
     except:
         from corpkit.process import is_number
         if is_number(key):
             return self.__getattribute__('c' + key)
Exemplo n.º 8
0
 def __init__(self, data):
     """
     Wrap ``data`` and expose each of its items as an attribute.

     Sets ``current`` to 0, ``high`` to the number of items (0 when
     ``data`` is empty or falsy) and ``data`` to the object passed in.
     Each item is then attached under the name ``makesafe(item)``.

     :param data: the items to hold; may be empty or ``None``
     """
     from corpkit.process import makesafe
     self.current = 0
     self.high = len(data) if data else 0
     self.data = data
     # nothing to expose as attributes
     if not data or len(data) == 0:
         return
     for subcorpus in data:
         setattr(self, makesafe(subcorpus), subcorpus)
Exemplo n.º 9
0
 def __getitem__(self, key):
     """
     Support slicing, integer indexing and lookup by name via
     ``self.subcorpora``.

     * slice: returns a ``Datalist`` of the selected items
     * int: the subcorpus at that position is passed through ``makesafe``
       and fetched from ``self.subcorpora`` by the resulting name
     * anything else: looked up as an attribute of ``self.subcorpora``; if
       that fails and ``is_number(key)`` holds, ``'c' + key`` is looked up
       on ``self``. Otherwise ``None`` is returned implicitly.
     """
     from corpkit.process import makesafe
     if isinstance(key, slice):
         # expand the slice into concrete positions
         positions = range(*key.indices(len(self.subcorpora)))
         return Datalist([self[position] for position in positions])
     if isinstance(key, int):
         safe_name = makesafe(self.subcorpora[key])
         return self.subcorpora.__getitem__(safe_name)
     try:
         return self.subcorpora.__getattribute__(key)
     except:
         from corpkit.process import is_number
         if is_number(key):
             # NOTE(review): this fallback reads from self, not
             # self.subcorpora like the lookup above -- confirm intended
             return self.__getattribute__('c' + key)
Exemplo n.º 10
0
 def __init__(self, data):
     """
     Initialise the dict from ``data`` and mirror its entries as attributes.

     :param data: mapping whose values are also attached to the instance
                  under the name ``makesafe(key)``
     """
     from corpkit.process import makesafe
     for name, entry in data.items():
         attribute = makesafe(name)
         setattr(self, attribute, entry)
     dict.__init__(self, data)
Exemplo n.º 11
0
 def __setitem__(self, key, value):
     """Store ``value`` under ``key`` and as attribute ``makesafe(key)``."""
     from corpkit.process import makesafe
     attribute = makesafe(key)
     setattr(self, attribute, value)
     super(Interrodict, self).__setitem__(key, value)
Exemplo n.º 12
0
 def __setitem__(self, key, value):
     """Store ``value`` under ``key`` and as attribute ``makesafe(key)``."""
     from corpkit.process import makesafe
     attribute = makesafe(key)
     setattr(self, attribute, value)
     super(Interrodict, self).__setitem__(key, value)
Exemplo n.º 13
0
    def __init__(self, path, **kwargs):
        """
        Set up the corpus from a path, a list/Datalist of parts, or an
        object carrying a ``path`` attribute.

        :param path: a string path; a list or ``Datalist`` whose first item
                     has a ``path`` attribute (the corpus path is then the
                     parent directory of that item); or any object with a
                     truthy ``path`` attribute
        :param level: (keyword) ``'c'`` for corpus, ``'s'`` for subcorpus,
                      ``'f'`` for file; defaults to ``'c'`` and is adjusted
                      below according to what is found on disk
        :param datatype: (keyword) datatype to assume; worked out
                         automatically when not given
        :param print_info: (keyword) print progress messages (default True)
        :param load_saved: (keyword) if truthy, files in
                           ``saved_interrogations`` whose name starts with
                           the corpus name are loaded as attributes
        """
        import glob
        import os
        from os.path import join, isdir, abspath, dirname, basename

        from corpkit.process import determine_datatype

        def has_subdirs(directory):
            """Return True if ``directory`` holds at least one directory."""
            return any(isdir(join(directory, entry))
                       for entry in os.listdir(directory))

        # levels are 'c' for corpus, 's' for subcorpus and 'f' for file. Which
        # one is determined automatically below, and processed accordingly. We
        # assume it is a full corpus to begin with.

        self.data = None

        level = kwargs.pop('level', 'c')
        self.datatype = kwargs.pop('datatype', None)
        print_info = kwargs.get('print_info', True)

        if isinstance(path, (list, Datalist)):
            self.path = abspath(dirname(path[0].path.rstrip('/')))
            self.name = basename(self.path)
            self.data = path
        elif isinstance(path, STRINGTYPE):
            self.path = abspath(path)
            self.name = basename(path)
        elif hasattr(path, 'path') and path.path:
            self.path = abspath(path.path)
            self.name = basename(path.path)

        # work out datatype and singlefile status as cheaply as possible,
        # shortcutting the full determine_datatype() check where we can
        self.singlefile = False
        if os.path.isfile(self.path):
            if self.path.endswith('.xml'):
                self.datatype = 'parse'
            self.singlefile = True
        elif not isdir(self.path) and isdir(join('data', path)):
            # not found as given: fall back to the same name under ./data
            self.path = abspath(join('data', path))

        if self.path.endswith('-parsed'):
            self.datatype = 'parse'
            # scan the directory once instead of listing it twice
            if has_subdirs(self.path):
                self.singlefile = False
            else:
                level = 's'
        else:
            if level == 'c' and not self.datatype:
                self.datatype, self.singlefile = determine_datatype(
                    self.path)
            if isdir(self.path) and not has_subdirs(self.path):
                level = 's'

        # if initialised on a file, process as file
        if self.singlefile and level == 'c':
            level = 'f'

        self.level = level

        # load each interrogation as an attribute
        if kwargs.get('load_saved', False):
            from corpkit.other import load
            from corpkit.process import makesafe
            if os.path.isdir('saved_interrogations'):
                prefix = self.name + '-'
                for filepath in glob.glob(r'saved_interrogations/*'):
                    filename = os.path.basename(filepath)
                    if not filename.startswith(self.name):
                        continue
                    # Strip the corpus prefix from the front only. Using
                    # str.replace removed every occurrence of it, mangling
                    # the attribute name for short corpus names.
                    if filename.startswith(prefix):
                        shortname = filename[len(prefix):]
                    else:
                        shortname = filename
                    shortname = os.path.splitext(shortname)[0]
                    if shortname in ['features', 'wordclasses', 'postags']:
                        continue
                    variable_safe = makesafe(shortname)
                    try:
                        setattr(self, variable_safe, load(filename))
                        if print_info:
                            print(
                                '\tLoaded %s as %s attribute.' %
                                (filename, variable_safe))
                    except AttributeError:
                        if print_info:
                            print(
                                '\tFailed to load %s as %s attribute. Name conflict?' %
                                (filename, variable_safe))

        if print_info:
            print('Corpus: %s' % self.path)