예제 #1
0
파일: corpus.py 프로젝트: maxdesp/corpkit
    def __init__(self, path, **kwargs):
        """
        Record where the corpus lives and classify it as a corpus,
        subcorpus or single file.

        :param path: a filesystem path, or a Datalist/list whose first item
                     has a ``path`` attribute (the corpus is then taken to be
                     the directory containing that item)
        :param level: (keyword) starting level: 'c', 's' or 'f'. Defaults to
                      'c'; the checks below may change it
        :param datatype: (keyword) initial value for ``self.datatype``. When
                         given, ``determine_datatype()`` is not called
        :param print_info: (keyword) print the corpus location. Default True

        Sets ``self.data``, ``self.datatype``, ``self.path``, ``self.name``,
        ``self.singlefile`` and ``self.level``.
        """
        import os
        from os.path import join, isfile, isdir, abspath, dirname, basename
        import re
        import operator
        from process import determine_datatype
        from corpus import Datalist

        # levels are 'c' for corpus, 's' for subcorpus and 'f' for file. Which
        # one applies is determined automatically below. We assume a full
        # corpus to begin with.

        self.data = None

        level = kwargs.pop('level', 'c')
        self.datatype = kwargs.pop('datatype', None)
        print_info = kwargs.get('print_info', True)

        # isinstance() rather than an exact class comparison, so that
        # subclasses of list/Datalist are recognised as well
        if isinstance(path, (Datalist, list)):
            self.path = abspath(dirname(path[0].path.rstrip('/')))
            self.data = path
        else:
            self.path = abspath(path)

        # Take the name from the normalised path: abspath() drops a trailing
        # separator, whereas basename('some/dir/') would return ''.
        self.name = basename(self.path)

        if print_info:
            print('\nCorpus at: %s' % self.path)

        self.singlefile = False
        if isfile(self.path):
            if self.path.endswith('.xml'):
                self.datatype = 'parse'
            self.singlefile = True
        else:
            if self.path.endswith('-parsed'):
                self.datatype = 'parse'
            elif level == 'c' and not self.datatype:
                self.datatype, self.singlefile = determine_datatype(self.path)
            # a directory without subdirectories is handled as a subcorpus;
            # one short-circuiting scan replaces the repeated listdir() calls
            if not any(isdir(join(self.path, d))
                       for d in os.listdir(self.path)):
                level = 's'

        # if initialised on a file, process as file
        if self.singlefile and level == 'c':
            level = 'f'

        self.level = level
예제 #2
0
파일: corpus.py 프로젝트: nkhuyu/corpkit
    def __init__(self, path, **kwargs):
        """
        Record where the corpus lives and classify it as a corpus,
        subcorpus or single file.

        :param path: path to a corpus directory or to a single file
        :param level: (keyword) starting level: "c", "s" or "f". Defaults to
                      "c"; the checks below may change it
        :param datatype: (keyword) initial value for ``self.datatype``. When
                         given, ``determine_datatype()`` is not called
        :param print_info: (keyword) print the corpus location. Default True

        Sets ``self.datatype``, ``self.path``, ``self.name``,
        ``self.singlefile`` and ``self.level``.
        """
        import os
        from os.path import join, isfile, isdir
        import re
        import operator
        from process import determine_datatype

        # levels are 'c' for corpus, 's' for subcorpus and 'f' for file. Which
        # one applies is determined automatically below. We assume a full
        # corpus to begin with.

        level = kwargs.pop("level", "c")
        self.datatype = kwargs.pop("datatype", None)
        print_info = kwargs.get("print_info", True)

        self.path = os.path.abspath(path)
        # Take the name from the normalised path: abspath() drops a trailing
        # separator, whereas basename("some/dir/") would return "".
        self.name = os.path.basename(self.path)

        if print_info:
            print("\nCorpus at: %s\n" % self.path)

        self.singlefile = False
        if isfile(self.path):
            if self.path.endswith(".xml"):
                self.datatype = "parse"
            self.singlefile = True
        else:
            # Test the normalised path so that "corpus-parsed/" (trailing
            # separator) is still recognised as a parsed corpus.
            if self.path.endswith("-parsed"):
                self.datatype = "parse"
            elif level == "c" and not self.datatype:
                # the helper receives the caller's path unchanged, as before
                self.datatype, self.singlefile = determine_datatype(path)
            # a directory without subdirectories is handled as a subcorpus;
            # one short-circuiting scan replaces the repeated listdir() calls
            if not any(isdir(join(self.path, d))
                       for d in os.listdir(self.path)):
                level = "s"

        # if initialised on a file, process as file
        if self.singlefile and level == "c":
            level = "f"

        self.level = level
예제 #3
0
파일: corpus.py 프로젝트: xsongx/corpkit
    def __init__(self, path, **kwargs):
        """
        Locate a corpus, classify it as corpus/subcorpus/file, and build its
        subcorpus and file listings.

        :param path: path to a corpus directory or to a single file
        :param level: (keyword) starting level: 'c', 's' or 'f'. Defaults to
                      'c'; the checks below may change it
        :param print_info: (keyword) print a summary of what was found.
                           Default True

        Sets ``path`` (relative to the current directory), ``abspath``,
        ``name``, ``datatype``, ``singlefile``, ``structure``, ``subcorpora``,
        ``files`` and ``features``. Each subcorpus and file is additionally
        exposed as an attribute named after a sanitised form of its name.
        """
        import os
        from os.path import join, isfile, isdir
        import re
        import operator
        from process import determine_datatype

        # levels are 'c' for corpus, 's' for subcorpus and 'f' for file. Which
        # one applies is determined automatically below. We assume a full
        # corpus to begin with.

        level = kwargs.pop('level', 'c')
        print_info = kwargs.get('print_info', True)

        path = os.path.abspath(path)

        self.path = os.path.relpath(path)
        self.name = os.path.basename(path)
        self.abspath = path

        # Defined up front: a single non-XML file previously left the
        # attribute unset, since no branch below assigns it in that case.
        self.datatype = None
        self.singlefile = False
        if isfile(path):
            if path.endswith('.xml'):
                self.datatype = 'parse'
            self.singlefile = True
        else:
            if path.endswith('-parsed'):
                self.datatype = 'parse'
            else:
                self.datatype, self.singlefile = determine_datatype(path)
            # A directory without subdirectories is a subcorpus. This now
            # also applies to '-parsed' directories, which were otherwise
            # processed as a corpus with no subcorpora and so listed no files.
            if not any(isdir(join(path, d)) for d in os.listdir(path)):
                level = 's'

        self.structure = None
        self.subcorpora = None
        self.files = None

        # these two will become .structure and .files if they exist
        struct = {}
        all_files = []

        # if initialised on a file, process as file
        if self.singlefile and level == 'c':
            level = 'f'

        by_name = operator.attrgetter('name')

        # For corpora, make Datalist of subcorpora, make structure dict, make a
        # Datalist of files, and print useful information
        if level == 'c':
            if print_info:
                print('\nCorpus at: %s\n' % self.abspath)
            subcorpora = Datalist(sorted(
                [Subcorpus(join(self.path, d))
                 for d in os.listdir(self.path)
                 if isdir(join(self.path, d))],
                key=by_name))
            self.subcorpora = subcorpora
            for sbc in subcorpora:
                # hidden (dot-prefixed) entries are ignored
                file_list = Datalist(sorted(
                    [File(f, sbc.path) for f in os.listdir(sbc.path)
                     if not f.startswith('.')],
                    key=by_name))
                struct[sbc] = file_list
                if print_info:
                    print('Subcorpus: %s\n\t%s\n' % (
                        sbc.name,
                        '\n\t'.join([f.name for f in file_list[:10]])))
                    if len(file_list) > 10:
                        print('... and %s more ... \n' % str(len(file_list) - 10))
                all_files.extend(file_list)

            self.structure = struct

        # for subcorpora, we only need the filelist and a simple structure dict
        elif level == 's':
            all_files = sorted(
                [File(f, self.path) for f in os.listdir(self.path)
                 if not f.startswith('.')],
                key=by_name)
            self.files = Datalist(all_files)
            self.structure = {'.': self.files}
            if print_info:
                print('\nCorpus created with %d files:\n\t%s\n' % (len(self.files), '\n\t'.join([i.name for i in self.files][:10])))
                if len(self.files) > 10:
                    print('... and %s more ... \n' % str(len(self.files) - 10))

        # for non File, we will add files attribute (for level 's' this
        # rebuilds the Datalist from the same list, as the original did)
        if level != 'f':
            self.files = Datalist(all_files)

        # this is the future home of the output of .get_stats()
        self.features = False

        # set accessible attribute names for subcorpora and files
        # NOTE(review): a sanitised name equal to an existing attribute
        # ('path', 'files', 'name', ...) still overwrites that attribute.
        # Raw string: same pattern, without the invalid '\W' string escape.
        variable_safe_r = re.compile(r'[\W0-9_]+', re.UNICODE)
        if self.subcorpora:
            for subcorpus in self.subcorpora:
                variable_safe = variable_safe_r.sub(
                    '', subcorpus.name.lower().split(',')[0])
                # names made only of digits/punctuation sanitise to ''
                if variable_safe:
                    setattr(self, variable_safe, subcorpus)
        if self.files:
            for f in self.files:
                variable_safe = variable_safe_r.sub(
                    '', f.name.lower().split('.')[0])
                if variable_safe:
                    setattr(self, variable_safe, f)
예제 #4
0
    def __init__(self, path, **kwargs):
        """
        Record where the corpus lives and classify it as a corpus,
        subcorpus or single file.

        :param path: a filesystem path, or a Datalist/list whose first item
                     has a ``path`` attribute (the corpus is then taken to be
                     the directory containing that item)
        :param level: (keyword) starting level: 'c', 's' or 'f'. Defaults to
                      'c'; the checks below may change it
        :param datatype: (keyword) initial value for ``self.datatype``. When
                         given, ``determine_datatype()`` is not called
        :param print_info: (keyword) print the corpus location. Default True

        Sets ``self.data``, ``self.datatype``, ``self.path``, ``self.name``,
        ``self.singlefile`` and ``self.level``.
        """
        import os
        from os.path import join, isfile, isdir, abspath, dirname, basename
        import re
        import operator
        from process import determine_datatype
        from corpus import Datalist

        # levels are 'c' for corpus, 's' for subcorpus and 'f' for file. Which
        # one applies is determined automatically below. We assume a full
        # corpus to begin with.

        self.data = None

        level = kwargs.pop('level', 'c')
        self.datatype = kwargs.pop('datatype', None)
        print_info = kwargs.get('print_info', True)

        # isinstance() also accepts subclasses, unlike the exact class
        # comparisons used before
        given_collection = isinstance(path, (Datalist, list))
        if given_collection:
            self.path = abspath(dirname(path[0].path.rstrip('/')))
            self.data = path
        else:
            self.path = abspath(path)

        # basename() of the raw argument is '' when it ends with a separator;
        # the abspath()-normalised path has none.
        self.name = basename(self.path)

        if print_info:
            print('\nCorpus at: %s' % self.path)

        self.singlefile = False
        if isfile(self.path):
            if self.path.endswith('.xml'):
                self.datatype = 'parse'
            self.singlefile = True
        else:
            parsed = self.path.endswith('-parsed')
            if parsed:
                self.datatype = 'parse'
            elif level == 'c' and not self.datatype:
                self.datatype, self.singlefile = determine_datatype(
                    self.path)

            # scan the directory once instead of once per check
            has_subdirs = any(
                isdir(join(self.path, entry))
                for entry in os.listdir(self.path))
            if not has_subdirs:
                # no subdirectories: handle as a subcorpus
                level = 's'

        # if initialised on a file, process as file
        if self.singlefile and level == 'c':
            level = 'f'

        self.level = level