Пример #1
0
    def __init__(self, *args):
        """Set up defaults, verify NLTK data and apply command-line options.

        Args:
            *args: getopt-style option strings. Recognised options:
                -d / --documentlist PATH
                    File read with ``readFromFile``; newlines, tabs and
                    spaces are stripped from its contents before they are
                    handed to ``self._getDocumentPaths``.
                -p / --parallel VALUE
                    '0' switches ``self.isParallel`` off, '1' switches it
                    on; any other value leaves the current setting as is.

        Raises:
            RuntimeError: if a required NLTK resource cannot be downloaded.
            getopt.GetoptError: if ``args`` holds an unrecognised option or
                an option is missing its argument.
        """
        ###Defaults###

        # Maximum parallel core count: ceil(cores / 2) when the machine has
        # at most 8 cores, otherwise ceil(0.75 * cores).
        cpuCount = multiprocessing.cpu_count()
        if cpuCount <= 8:
            self.maxParallelCoreCount = int(ceil(float(cpuCount) / 2))
        else:
            self.maxParallelCoreCount = int(ceil(0.75 * cpuCount))

        self.extractorDictionary = {'text': gfe(), 'html': gfe()}
        self.documentPaths = []
        self.extractorSelector = None
        self.isParallel = True

        self.matrixDict = OrderedDict()
        self.svms = None
        self.dTrees = None
        self.naiveBayes = None

        ###Dependency checks###

        # all() over a generator stops at the first failed download, exactly
        # like a chained `and`.
        if not all(downloadNLTKData(resource)
                   for resource in ('punkt', 'cmudict', 'wordnet')):
            raise RuntimeError(
                "\n\nCould not download the required nltk dependencies.\n")

        ###User arguments###
        #Text must be delimited by semi-colon, in
        #each file passed into the program
        options, _extras = getopt.getopt(args, 'd:p:',
                                         ['documentlist=', 'parallel='])

        for opt, arg in options:
            if opt in ('-d', '--documentlist'):
                # Only this option carries a path, so normalise it here.
                documentListString = readFromFile(normpath(arg))

                # Removes unnecessary characters; str.replace is a no-op
                # when the character is absent, so no membership test needed.
                for ch in ('\n', '\t', ' '):
                    documentListString = documentListString.replace(ch, '')

                self.documentPaths = self._getDocumentPaths(documentListString)

            elif opt in ('-p', '--parallel'):
                # getopt always yields strings, so compare them directly.
                # This avoids the Python-2-only `basestring` name and the
                # ValueError that int() raised for values such as 'x'.
                if arg == '0':
                    self.isParallel = False
                elif arg == '1':
                    self.isParallel = True

        self.extractorSelector = self._createExtractor(
            self.extractorDictionary)
Пример #2
0
        def __init__(self, *args):
            """Set up defaults, verify NLTK data and apply command-line options.

            Args:
                *args: getopt-style option strings. Recognised options:
                    -d / --documentlist PATH
                        File read with ``readFromFile``; newlines, tabs and
                        spaces are stripped from its contents before they
                        are handed to ``self._getDocumentPaths``.
                    -p / --parallel VALUE
                        '0' switches ``self.isParallel`` off, '1' switches
                        it on; any other value leaves the setting as is.

            Raises:
                RuntimeError: if a required NLTK resource cannot be
                    downloaded.
                getopt.GetoptError: if ``args`` holds an unrecognised option
                    or an option is missing its argument.
            """
            ###Defaults###

            # Maximum parallel core count: ceil(cores / 2) when the machine
            # has at most 8 cores, otherwise ceil(0.75 * cores).
            cpuCount = multiprocessing.cpu_count()
            if cpuCount <= 8:
                self.maxParallelCoreCount = int(ceil(float(cpuCount) / 2))
            else:
                self.maxParallelCoreCount = int(ceil(0.75 * cpuCount))

            self.extractorDictionary = {'text': gfe(), 'html': gfe()}
            self.documentPaths = []
            self.extractorSelector = None
            self.isParallel = True

            self.matrixDict = OrderedDict()
            self.svms = None
            self.dTrees = None
            self.naiveBayes = None

            ###Dependency checks###

            # all() over a generator stops at the first failed download,
            # exactly like a chained `and`.
            if not all(downloadNLTKData(resource)
                       for resource in ('punkt', 'cmudict', 'wordnet')):
                raise RuntimeError(
                    "\n\nCould not download the required nltk dependencies.\n")

            ###User arguments###
            #Text must be delimited by semi-colon, in
            #each file passed into the program
            options, _extras = getopt.getopt(args, 'd:p:',
                                             ['documentlist=', 'parallel='])

            for opt, arg in options:
                if opt in ('-d', '--documentlist'):
                    # Only this option carries a path, so normalise it here.
                    documentListString = readFromFile(normpath(arg))

                    # Removes unnecessary characters; str.replace is a no-op
                    # when the character is absent.
                    for ch in ('\n', '\t', ' '):
                        documentListString = documentListString.replace(ch, '')

                    self.documentPaths = self._getDocumentPaths(
                        documentListString)

                elif opt in ('-p', '--parallel'):
                    # getopt always yields strings, so compare them directly.
                    # This avoids the Python-2-only `basestring` name and the
                    # ValueError that int() raised for values such as 'x'.
                    if arg == '0':
                        self.isParallel = False
                    elif arg == '1':
                        self.isParallel = True

            self.extractorSelector = self._createExtractor(
                self.extractorDictionary)
Пример #3
0
 def __init__(self, extractorDict):
     """Index each category's indicators and remember the extractors.

     For every key of ``extractorDict`` an entry is written into
     ``self.categoryDictionary``: ``set(extractor.indicators)`` when the
     mapped extractor is an instance of ``be``, otherwise an empty set.
     The mapping itself is then kept as ``self.extractorDictionary`` and a
     fresh ``gfe()`` instance becomes ``self.defaultExtractor``.

     NOTE(review): ``self.categoryDictionary`` is not created in this
     method, so it presumably already exists (e.g. as a class attribute) --
     confirm that it is not unintentionally shared between instances.

     Args:
         extractorDict: mapping from category key to extractor object.
     """
     for key in extractorDict:
         candidate = extractorDict[key]
         # Only `be` instances contribute indicators.
         indicators = (set(candidate.indicators)
                       if isinstance(candidate, be) else set())
         self.categoryDictionary[key] = indicators

     self.extractorDictionary = extractorDict
     self.defaultExtractor = gfe()