示例#1
0
文件: corpus.py 项目: xsongx/corpkit
    def __init__(self, data):
        """Build the collection from a directory path or from a sequence.

        :param data: either the path of a directory, in which case each of
                     its immediate subdirectories is wrapped in a ``Corpus``,
                     or an iterable whose string items are wrapped in a
                     ``Corpus`` and whose other items are kept unchanged
        :raises ValueError: if ``data`` is a string that does not point to
                            an existing directory
        """
        # function-scope import, kept where the original code placed it
        from corpus import Corpus
        # handle a folder containing corpora
        if isinstance(data, str):
            import os
            from os.path import join, isdir
            root = data
            if not isdir(root):
                raise ValueError('Corpora(str) needs to point to a directory.')
            # one path per immediate subdirectory, in sorted order
            data = sorted([join(root, d) for d in os.listdir(root)
                           if isdir(join(root, d))])
        # wrap path strings in Corpus objects; a new list is built so the
        # sequence passed in by the caller is never modified in place
        data = [Corpus(i) if isinstance(i, str) else i for i in data]

        # now turn it into a Datalist
        Datalist.__init__(self, data)
示例#2
0
    def __init__(self, data):
        """Build the collection from a directory path or from a sequence.

        :param data: either the path of a directory, in which case each of
                     its immediate subdirectories is wrapped in a ``Corpus``,
                     or an iterable whose string items are wrapped in a
                     ``Corpus`` and whose other items are kept unchanged
        :raises ValueError: if ``data`` is a string that does not point to
                            an existing directory
        """
        # function-scope import, kept where the original code placed it
        from corpus import Corpus
        # handle a folder containing corpora
        if isinstance(data, str):
            import os
            from os.path import join, isdir
            root = data
            if not isdir(root):
                raise ValueError('Corpora(str) needs to point to a directory.')
            # one path per immediate subdirectory, in sorted order
            data = sorted([join(root, d) for d in os.listdir(root)
                           if isdir(join(root, d))])
        # wrap path strings in Corpus objects; a new list is built so the
        # sequence passed in by the caller is never modified in place
        data = [Corpus(i) if isinstance(i, str) else i for i in data]

        # now turn it into a Datalist
        Datalist.__init__(self, data)
示例#3
0
 def __getitem__(self, key):
     """Look up subcorpora by slice, integer position, or name.

     * slice: returns a ``Datalist`` holding ``self[i]`` for every index
       the slice selects out of ``len(self.subcorpora)``
     * int: takes the subcorpus at that position, passes it through
       ``makesafe`` and looks the result up in ``self.subcorpora``
     * anything else: tried as an attribute name on ``self.subcorpora``;
       if that lookup fails and ``is_number(key)`` is true, the attribute
       ``'c' + key`` of this object is returned instead

     Returns ``None`` when none of the branches yields a value.
     """
     from process import makesafe
     if isinstance(key, slice):
         # expand the slice against the current number of subcorpora
         positions = range(*key.indices(len(self.subcorpora)))
         return Datalist([self[ii] for ii in positions])
     if type(key) is int:
         return self.subcorpora.__getitem__(makesafe(self.subcorpora[key]))
     try:
         return self.subcorpora.__getattribute__(key)
     except Exception:
         # Exception instead of a bare except, so KeyboardInterrupt and
         # SystemExit are no longer swallowed by the fallback lookup
         from process import is_number
         if is_number(key):
             return self.__getattribute__('c' + key)
     # nothing matched: keep the original best-effort result
     return None
示例#4
0
 def subcorpora(self):
     """A list-like object containing a corpus' subcorpora.

     If ``self.data`` is already a list or a ``Datalist`` it is returned
     unchanged. Otherwise, when ``self.level == 'c'``, every directory
     directly under ``self.path`` is wrapped in a ``Subcorpus``; the results
     are sorted by their ``name`` attribute and returned as a ``Datalist``.
     Each subcorpus is also set as an attribute on this object under a
     simplified form of its name. Returns ``None`` for any other level.
     """
     import operator
     import os
     import re
     from os.path import join, isdir
     if isinstance(self.data, list) or isinstance(self.data, Datalist):
         return self.data
     if self.level != 'c':
         return None
     # raw string: '\W' inside a plain literal is an invalid escape sequence
     variable_safe_r = re.compile(r'[\W0-9_]+', re.UNICODE)
     found = [Subcorpus(join(self.path, d), self.datatype)
              for d in os.listdir(self.path)
              if isdir(join(self.path, d))]
     sbs = Datalist(sorted(found, key=operator.attrgetter('name')))
     for subcorpus in sbs:
         # attribute name: the text before the first comma, lower-cased,
         # with non-word characters, digits and underscores removed
         variable_safe = variable_safe_r.sub(
             '', subcorpus.name.lower().split(',')[0])
         setattr(self, variable_safe, subcorpus)
     return sbs
示例#5
0
    def files(self):
        """Return the files found in this object's folder as a ``Datalist``.

        Only acts when ``self.level == 's'``: every entry of ``self.path``
        whose name does not start with a dot is wrapped in a ``File``, and
        the results are ordered by their ``name`` attribute. For any other
        level nothing is returned (``None``).

        Usage: ``corpus.subcorpora[0].files``
        """
        import os
        import operator

        if self.level != 's':
            return None
        # skip dot-prefixed entries
        visible = [entry for entry in os.listdir(self.path)
                   if not entry.startswith('.')]
        wrapped = [File(entry, self.path, self.datatype) for entry in visible]
        wrapped.sort(key=operator.attrgetter('name'))
        return Datalist(wrapped)