Пример #1
0
    def verifysettings(self):
        """Fill in every setting that was not supplied and create the cache.

        The parent implementation runs first. Afterwards each key missing
        from ``self.settings`` receives its default value; 'shortlength'
        falls back to the (possibly just defaulted) 'minlength' value.
        Finally ``self.cache`` is obtained from ``getcache``.
        """
        super().verifysettings()

        # Defaults that precede 'shortlength'; the order of the original
        # assignments is kept so keys are inserted in the same sequence.
        leading_defaults = (
            ('class', 'nonworderror'),
            ('runonclass', 'runonerror'),
            ('runon', True),
            ('maxdistance', 2),
            ('maxdistance_short', 1),
            ('maxlength', 25),  # longer words will be ignored
            ('minlength', 5),  # shorter words will be ignored
        )
        for key, default in leading_defaults:
            if key not in self.settings:
                self.settings[key] = default

        # 'shortlength' mirrors 'minlength' unless it was given explicitly.
        if 'shortlength' not in self.settings:
            self.settings['shortlength'] = self.settings['minlength']

        # The list defaults are built on every call, so they are never shared.
        trailing_defaults = (
            ('maxnrclosest', 5),
            ('suffixes', []),
            ('prefixes', []),
        )
        for key, default in trailing_defaults:
            if key not in self.settings:
                self.settings[key] = default

        # The second argument is the default cache size.
        self.cache = getcache(self.settings, 1000)
Пример #2
0
    def verifysettings(self):
        """Apply default settings, copy tunables to attributes, validate models.

        Missing 'class', 'algorithm', 'leftcontext', 'rightcontext' and
        'maxdistance' entries are written into ``self.settings``. The
        'threshold', 'freqthreshold', 'minlength' and 'probfactor' values are
        copied to attributes of the same name, falling back to a built-in
        default when the setting is absent. ``self.debug``, ``self.hapaxer``
        and ``self.cache`` are initialised as well.

        Raises:
            Exception: if ``self.models`` has no first entry, if the first
                model does not end in '.ibase', or if a second model is given
                that does not end in 'colibri.patternmodel'.
        """
        # The class must be defaulted before the parent verifies the settings.
        if 'class' not in self.settings:
            self.settings['class'] = 'confusion'

        super().verifysettings()

        for key, default in (('algorithm', 1), ('leftcontext', 3), ('rightcontext', 3)):
            if key not in self.settings:
                self.settings[key] = default

        # These values live on the instance; self.settings is left untouched.
        for name, default in (('threshold', 0.9), ('freqthreshold', 2),
                              ('minlength', 5), ('probfactor', 10)):
            if name in self.settings:
                setattr(self, name, self.settings[name])
            else:
                setattr(self, name, default)

        if 'maxdistance' not in self.settings:
            self.settings['maxdistance'] = 2

        self.debug = bool(self.settings['debug']) if 'debug' in self.settings else False

        self.hapaxer = gethapaxer(self, self.settings)

        self.cache = getcache(self.settings, 1000)

        # Only the lookup itself is guarded, so the specific extension errors
        # below reach the caller instead of being replaced by a generic one.
        try:
            modelfile = self.models[0]
        except (IndexError, KeyError) as err:
            raise Exception("Expected one or two models, the first a TIMBL instance base, and the optional second a colibri patternmodel, got " + str(len(self.models))) from err

        if not modelfile.endswith(".ibase"):
            raise Exception("First model must be a TIMBL instance base model, which must have the extension '.ibase', got " + modelfile + " instead")

        if len(self.models) > 1:
            lexiconfile = self.models[1]
            if not lexiconfile.endswith("colibri.patternmodel"):
                # Report the offending lexicon file, not the first model.
                raise Exception("Second model must be a Colibri pattern model, which must have the extensions '.colibri.patternmodel', got " + lexiconfile + " instead")
Пример #3
0
    def verifysettings(self):
        """Populate defaults, mirror tunables onto the instance, check models.

        Keys 'class', 'algorithm', 'leftcontext', 'rightcontext' and
        'maxdistance' are added to ``self.settings`` when absent. The
        attributes ``threshold``, ``freqthreshold``, ``minlength`` and
        ``probfactor`` take the corresponding setting, or a built-in default
        when it is not provided. ``self.debug``, ``self.hapaxer`` and
        ``self.cache`` are set up too.

        Raises:
            Exception: when ``self.models`` has no first entry, when the
                first model lacks the '.ibase' extension, or when a second
                model is present without the 'colibri.patternmodel' suffix.
        """
        # Default the class before delegating to the parent verification.
        if 'class' not in self.settings:
            self.settings['class'] = 'confusion'

        super().verifysettings()

        if 'algorithm' not in self.settings:
            self.settings['algorithm'] = 1

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 3

        # Instance attributes: use the configured value when there is one.
        attribute_defaults = {
            'threshold': 0.9,
            'freqthreshold': 2,
            'minlength': 5,
            'probfactor': 10,
        }
        for name, default in attribute_defaults.items():
            value = self.settings[name] if name in self.settings else default
            setattr(self, name, value)

        if 'maxdistance' not in self.settings:
            self.settings['maxdistance'] = 2

        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False

        self.hapaxer = gethapaxer(self, self.settings)

        self.cache = getcache(self.settings, 1000)

        # Keep the try body to the lookup alone: a bare except around the
        # whole validation would hide the precise extension errors below.
        try:
            modelfile = self.models[0]
        except (IndexError, KeyError) as err:
            raise Exception("Expected one or two models, the first a TIMBL instance base, and the optional second a colibri patternmodel, got " + str(len(self.models))) from err

        if not modelfile.endswith(".ibase"):
            raise Exception("First model must be a TIMBL instance base model, which must have the extension '.ibase', got " + modelfile + " instead")

        if len(self.models) > 1:
            lexiconfile = self.models[1]
            if not lexiconfile.endswith("colibri.patternmodel"):
                # The message names the second model, which is the one at fault.
                raise Exception("Second model must be a Colibri pattern model, which must have the extensions '.colibri.patternmodel', got " + lexiconfile + " instead")
Пример #4
0
    def verifysettings(self):
        """Complete ``self.settings`` with defaults and initialise the cache.

        Runs the parent verification, then assigns a default to each key that
        is not yet present. 'shortlength' defaults to the current 'minlength'
        value. ``self.cache`` is created last via ``getcache``.
        """
        super().verifysettings()

        def fill(pairs):
            # Assign each default only where the key is still missing.
            for name, fallback in pairs:
                if name not in self.settings:
                    self.settings[name] = fallback

        fill([
            ('class', 'nonworderror'),
            ('runonclass', 'runonerror'),
            ('runon', True),
            ('maxdistance', 2),
            ('maxdistance_short', 1),
            ('maxlength', 25),  # longer words will be ignored
            ('minlength', 5),  # shorter words will be ignored
        ])

        # Resolved after 'minlength' so it can reuse that value.
        if 'shortlength' not in self.settings:
            self.settings['shortlength'] = self.settings['minlength']

        fill([
            ('maxnrclosest', 5),
            ('suffixes', []),
            ('prefixes', []),
        ])

        default_cache_size = 1000
        self.cache = getcache(self.settings, default_cache_size)
Пример #5
0
    def verifysettings(self):
        """Normalise the delimiter, fill in missing settings, create the cache.

        'class' is defaulted before the parent verification runs. The
        'delimiter' setting defaults to a tab; the names 'space', 'tab' and
        'comma' (case-insensitive) are translated into the actual character,
        any other value is kept as given. Every other missing key receives
        its default, and 'shortlength' falls back to 'minlength'.
        """
        if 'class' not in self.settings:
            self.settings['class'] = 'nonworderror'

        super().verifysettings()

        if 'delimiter' not in self.settings:
            self.settings['delimiter'] = "\t"
        else:
            # Translate a symbolic delimiter name into its character.
            named_delimiters = {'space': " ", 'tab': "\t", 'comma': ","}
            name = self.settings['delimiter'].lower()
            if name in named_delimiters:
                self.settings['delimiter'] = named_delimiters[name]

        # Reversed format has (word,freq) pairs rather than (freq,word) pairs.
        if 'reversedformat' not in self.settings:
            self.settings['reversedformat'] = False

        # The key must be exactly 'ordered': testing for 'ordered ' (with a
        # trailing space) never matched and overwrote a configured value.
        if 'ordered' not in self.settings:
            self.settings['ordered'] = True  # Model file is ordered in descending frequency

        if 'freqthreshold' not in self.settings:
            self.settings['freqthreshold'] = 100
        if 'maxdistance' not in self.settings:
            self.settings['maxdistance'] = 2
        if 'maxdistance_short' not in self.settings:
            self.settings['maxdistance_short'] = 1
        if 'maxlength' not in self.settings:
            self.settings['maxlength'] = 25  # longer words will be ignored
        if 'minlength' not in self.settings:
            self.settings['minlength'] = 5  # shorter words will be ignored
        if 'shortlength' not in self.settings:
            self.settings['shortlength'] = self.settings['minlength']
        if 'minfreqthreshold' not in self.settings:
            self.settings['minfreqthreshold'] = 10000
        if 'freqfactor' not in self.settings:
            self.settings['freqfactor'] = 10000
        if 'maxnrclosest' not in self.settings:
            self.settings['maxnrclosest'] = 5

        # The second argument is the default cache size.
        self.cache = getcache(self.settings, 1000)

        if 'suffixes' not in self.settings:
            self.settings['suffixes'] = []
        if 'prefixes' not in self.settings:
            self.settings['prefixes'] = []
Пример #6
0
    def verifysettings(self):
        """Resolve the delimiter, apply default settings and build the cache.

        The 'class' default is set ahead of the parent verification. A
        missing 'delimiter' becomes a tab, and the case-insensitive names
        'space', 'tab' and 'comma' are replaced by the character they stand
        for; other delimiter values pass through unchanged. Remaining keys
        are defaulted when absent, with 'shortlength' taking the value of
        'minlength'.
        """
        if 'class' not in self.settings:
            self.settings['class'] = 'nonworderror'

        super().verifysettings()

        if 'delimiter' not in self.settings:
            self.settings['delimiter'] = "\t"
        else:
            symbolic = self.settings['delimiter'].lower()
            if symbolic == 'space':
                self.settings['delimiter'] = " "
            elif symbolic == 'tab':
                self.settings['delimiter'] = "\t"
            elif symbolic == 'comma':
                self.settings['delimiter'] = ","

        # Reversed format has (word,freq) pairs rather than (freq,word) pairs.
        if 'reversedformat' not in self.settings:
            self.settings['reversedformat'] = False

        # Look up 'ordered' itself; the former test for 'ordered ' (trailing
        # space) was always true and clobbered an explicitly set value.
        if 'ordered' not in self.settings:
            self.settings['ordered'] = True  # Model file is ordered in descending frequency

        numeric_defaults = (
            ('freqthreshold', 100),
            ('maxdistance', 2),
            ('maxdistance_short', 1),
            ('maxlength', 25),  # longer words will be ignored
            ('minlength', 5),  # shorter words will be ignored
        )
        for key, default in numeric_defaults:
            if key not in self.settings:
                self.settings[key] = default

        if 'shortlength' not in self.settings:
            self.settings['shortlength'] = self.settings['minlength']

        for key, default in (('minfreqthreshold', 10000), ('freqfactor', 10000),
                             ('maxnrclosest', 5)):
            if key not in self.settings:
                self.settings[key] = default

        # The second argument is the default cache size.
        self.cache = getcache(self.settings, 1000)

        if 'suffixes' not in self.settings:
            self.settings['suffixes'] = []
        if 'prefixes' not in self.settings:
            self.settings['prefixes'] = []
Пример #7
0
    def verifysettings(self):
        """Apply default settings, set instance attributes, validate the model.

        Missing 'class', 'algorithm', 'leftcontext', 'rightcontext' and
        'maxdistance' entries are added to ``self.settings``.
        ``self.threshold`` takes the 'threshold' setting, or 0.9 when it is
        not given. ``self.debug``, ``self.hapaxer`` and ``self.cache`` are
        initialised as well.

        Raises:
            Exception: if ``self.models`` has no first entry, or if the first
                model does not end in '.ibase'.
        """
        if 'class' not in self.settings:
            self.settings['class'] = 'contexterror'

        super().verifysettings()

        if 'algorithm' not in self.settings:
            self.settings['algorithm'] = 1

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 3

        # Use the configured threshold when present. The condition used to be
        # inverted, which raised KeyError without the setting and ignored it
        # when it was supplied.
        if 'threshold' in self.settings:
            self.threshold = self.settings['threshold']
        else:
            self.threshold = 0.9

        if 'maxdistance' not in self.settings:
            self.settings['maxdistance'] = 2

        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False

        self.hapaxer = gethapaxer(self.settings)

        self.cache = getcache(self.settings, 1000)

        # Guard only the lookup so the extension error below is not replaced
        # by the generic "expected one model" message.
        try:
            modelfile = self.models[0]
        except (IndexError, KeyError, TypeError) as err:
            raise Exception("Expected one model, got 0 or more") from err

        if not modelfile.endswith(".ibase"):
            raise Exception("TIMBL models must have the extension ibase, got " + modelfile + " instead")
Пример #8
0
    def verifysettings(self):
        """Apply default settings, derive instance attributes, require a model.

        Missing 'class', 'leftcontext', 'rightcontext' and 'maxdistance'
        entries are added to ``self.settings``. ``self.maxcontext`` is the
        larger of the two context sizes. ``self.freqthreshold`` and
        ``self.threshold`` take the matching setting, or 25 and 0.9
        respectively when it is absent. ``self.debug``, ``self.hapaxer`` and
        ``self.cache`` are initialised as well.

        Raises:
            Exception: if ``self.models`` has no first entry.
        """
        if 'class' not in self.settings:
            self.settings['class'] = 'contexterror'

        super().verifysettings()

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 3

        self.maxcontext = max(self.settings['leftcontext'], self.settings['rightcontext'])

        # The frequency threshold gets its own attribute; it was previously
        # written into self.threshold and overwritten straight away.
        if 'freqthreshold' in self.settings:
            self.freqthreshold = self.settings['freqthreshold']
        else:
            self.freqthreshold = 25

        # Use the configured threshold when present (the test was inverted,
        # causing a KeyError without the setting).
        if 'threshold' in self.settings:
            self.threshold = self.settings['threshold']
        else:
            self.threshold = 0.9

        if 'maxdistance' not in self.settings:
            self.settings['maxdistance'] = 2

        if 'debug' in self.settings:
            self.debug = bool(self.settings['debug'])
        else:
            self.debug = False

        self.hapaxer = gethapaxer(self.settings)

        self.cache = getcache(self.settings, 1000)

        # Only check that a first model exists; its value is not used here.
        try:
            self.models[0]
        except (IndexError, KeyError, TypeError) as err:
            raise Exception("Expected one model, got 0 or more") from err
Пример #9
0
    def verifysettings(self):
        """Apply default settings, derive instance attributes, require a model.

        Missing 'class', 'leftcontext', 'rightcontext' and 'maxdistance'
        entries are added to ``self.settings``. ``self.maxcontext`` is the
        larger of the two context sizes. ``self.freqthreshold`` and
        ``self.threshold`` take the matching setting, or 25 and 0.9
        respectively when it is absent. ``self.debug``, ``self.hapaxer`` and
        ``self.cache`` are initialised as well.

        Raises:
            Exception: if ``self.models`` has no first entry.
        """
        if 'class' not in self.settings:
            self.settings['class'] = 'confusion'

        super().verifysettings()

        if 'leftcontext' not in self.settings:
            self.settings['leftcontext'] = 3

        if 'rightcontext' not in self.settings:
            self.settings['rightcontext'] = 3

        self.maxcontext = max(self.settings['leftcontext'], self.settings['rightcontext'])

        # Honour a configured frequency threshold as well as the default, so
        # the attribute is always set.
        if 'freqthreshold' in self.settings:
            self.freqthreshold = self.settings['freqthreshold']
        else:
            self.freqthreshold = 25

        # Use the configured threshold when present. The condition used to be
        # inverted, which raised KeyError without the setting and ignored it
        # when it was supplied.
        if 'threshold' in self.settings:
            self.threshold = self.settings['threshold']
        else:
            self.threshold = 0.9

        if 'maxdistance' not in self.settings:
            self.settings['maxdistance'] = 2

        self.debug = bool(self.settings['debug']) if 'debug' in self.settings else False

        self.hapaxer = gethapaxer(self, self.settings)

        self.cache = getcache(self.settings, 1000)

        # Catch only lookup failures rather than every exception.
        try:
            self.models[0]
        except (IndexError, KeyError, TypeError) as err:
            raise Exception("Expected one model, got 0 or more") from err