def __init__(self, data):
    """Build the collection from a directory path or an iterable of corpora.

    :param data: either a path (``str``) to a directory, in which case
                 every immediate subdirectory is treated as one corpus, or
                 an iterable whose ``str`` items are turned into ``Corpus``
                 objects and whose other items are passed through unchanged
    :raises ValueError: if `data` is a string that does not point to an
                        existing directory
    """
    # Function-level import, kept where the original code had it.
    from corpus import Corpus

    # Handle a folder containing corpora: expand it to the sorted list of
    # its subdirectory paths.
    if isinstance(data, str):
        import os
        from os.path import isdir, join

        if not isdir(data):
            raise ValueError('Corpora(str) needs to point to a directory.')
        candidates = (join(data, entry) for entry in os.listdir(data))
        data = sorted(path for path in candidates if isdir(path))

    # Build a fresh list instead of assigning into `data`, so the caller's
    # sequence is left untouched and non-indexable iterables (tuples,
    # generators) are accepted as well.
    corpora = [Corpus(item) if isinstance(item, str) else item
               for item in data]

    # Now turn it into a Datalist.
    Datalist.__init__(self, corpora)
def __getitem__(self, key):
    """Look up subcorpora by slice, integer position, or attribute name.

    :param key: a ``slice`` (each selected position is looked up and the
                results are wrapped in a ``Datalist``), an ``int``
                (positional lookup), or any other value, which is tried as
                an attribute name on ``self.subcorpora``
    :returns: the matching item(s); ``None`` when a non-integer key matches
              no attribute of ``self.subcorpora`` and ``is_number(key)`` is
              false
    """
    from process import makesafe

    if isinstance(key, slice):
        # Resolve start/stop/step against the current length, then go
        # through the integer branch below for every selected position.
        positions = range(*key.indices(len(self.subcorpora)))
        return Datalist([self[position] for position in positions])

    if isinstance(key, int):
        # The item found at `key` is passed through makesafe() and the
        # result is used for a second lookup on the same container.
        return self.subcorpora.__getitem__(makesafe(self.subcorpora[key]))

    try:
        return self.subcorpora.__getattribute__(key)
    except Exception:
        # Deliberately broad (the lookup is best-effort), but no longer a
        # bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
        from process import is_number

        if is_number(key):
            # Retry on self with a 'c' prefix.
            # NOTE(review): presumably because numeric names cannot be used
            # as attribute names directly -- confirm against makesafe().
            return self.__getattribute__('c' + key)

    # Nothing matched: the original fell off the end here, returning None.
    return None
def subcorpora(self):
    """A list-like object containing a corpus' subcorpora.

    If ``self.data`` is already a plain ``list`` or a ``Datalist`` it is
    returned unchanged.  Otherwise, when ``self.level == 'c'``, every
    immediate subdirectory of ``self.path`` is wrapped in a ``Subcorpus``
    and the results are sorted by their ``name`` attribute.  As a side
    effect each subcorpus is also set as an attribute on ``self`` under a
    sanitised form of its name (lower-cased, cut at the first comma, with
    non-word characters, digits and underscores removed).

    :returns: a ``list`` or ``Datalist``; ``None`` when ``self.data`` is
              neither of those and ``self.level`` is not ``'c'``
    """
    import os
    import re
    from operator import attrgetter
    from os.path import isdir, join

    data = self.data
    # Exact type match, as in the original: subclasses fall through to the
    # directory scan below.
    if type(data) is list or type(data) is Datalist:
        return data

    if self.level != 'c':
        return None

    # Raw string: the backslash-W class in a plain literal is an invalid
    # escape sequence and triggers a warning on modern Python.
    variable_safe_r = re.compile(r'[\W0-9_]+', re.UNICODE)

    subdirs = (join(self.path, d) for d in os.listdir(self.path))
    sbs = Datalist(sorted(
        [Subcorpus(subdir, self.datatype) for subdir in subdirs
         if isdir(subdir)],
        key=attrgetter('name')))

    for subcorpus in sbs:
        variable_safe = variable_safe_r.sub(
            '', subcorpus.name.lower().split(',')[0])
        setattr(self, variable_safe, subcorpus)
    return sbs
def files(self):
    """Return the files found directly inside a subcorpus folder.

    Only objects whose ``level`` is ``'s'`` list files; for any other level
    the result is ``None``.  Directory entries whose name starts with a dot
    are skipped, each remaining entry is wrapped in a ``File`` and the
    collection is ordered by the ``name`` attribute of those objects.

    >>> corpus.subcorpora[0].files
    """
    import os
    from operator import attrgetter

    if self.level != 's':
        return None

    # Drop hidden entries before building any File objects.
    visible = [entry for entry in os.listdir(self.path)
               if not entry.startswith('.')]
    wrapped = [File(entry, self.path, self.datatype) for entry in visible]
    wrapped.sort(key=attrgetter('name'))
    return Datalist(wrapped)