def train(self, sourcefile, modelfile, **parameters):
    """Build a Timbl classifier from the n-grams of a text corpus and save it.

    Every n-gram yielded by ``Windower(line, n)`` for every line of
    ``sourcefile`` becomes one training instance: the focus item (at index
    ``leftcontext``) is the class label, the remaining items are the features.

    Args:
        sourcefile: Path of the corpus. Opened with ``bz2`` when it ends in
            ``.bz2``, with ``gzip`` when it ends in ``.gz``, with ``io``
            otherwise; always read as UTF-8 text.
        modelfile: Path of the model; with ``.ibase`` removed it is the file
            prefix handed to ``TimblClassifier``.
        **parameters: Accepted but not used by this method.
    """
    left = self.settings['leftcontext']
    right = self.settings['rightcontext']
    windowsize = left + 1 + right

    self.log("Generating training instances...")
    # has been verified earlier
    classifier = TimblClassifier(modelfile.replace(".ibase", ""), self.gettimbloptions())

    # Pick the open() matching the compression suggested by the file extension.
    if sourcefile.endswith(".bz2"):
        opener = bz2.open
    elif sourcefile.endswith(".gz"):
        opener = gzip.open
    else:
        opener = io.open

    with opener(sourcefile, mode='rt', encoding='utf-8') as corpus:
        for linenum, line in enumerate(corpus):
            if linenum % 100000 == 0:
                # Progress report on stderr, once every 100000 lines.
                stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print(stamp + " - " + str(linenum), file=sys.stderr)
            for window in Windower(line, windowsize):
                label = window[left]
                features = tuple(window[:left]) + tuple(window[left + 1:])
                classifier.append(features, label)

    self.log("Training classifier...")
    classifier.train()

    self.log("Saving model " + modelfile)
    classifier.save()
def train(self, sourcefile, modelfile, **parameters):
    """Train and save the Timbl classifier from a tokenised text corpus.

    Each line of ``sourcefile`` is split on single spaces. Every token that
    contains at least one alphabetic character is buffered as a tuple
    ``(word, is_capitalised, punc)`` where ``is_capitalised`` is true when the
    word equals its first character uppercased followed by the rest
    lowercased, and ``punc`` is the immediately preceding token if that token
    is in ``TIMBLPuncRecaseModule.PUNCTUATION`` (empty string otherwise).
    Whenever the buffer holds ``leftcontext + rightcontext + 1`` entries it is
    passed to ``self.addtraininstance``, whose return value becomes the new
    buffer.

    The buffer and the previous token are deliberately carried over from one
    line to the next; no begin/end padding is added.

    Args:
        sourcefile: Path of the corpus (``.bz2`` and ``.gz`` are opened with
            the matching module, anything else with ``io``); read as UTF-8
            with undecodable bytes ignored.
        modelfile: Path of the model; with ``.ibase`` removed it is the file
            prefix handed to ``TimblClassifier``.
        **parameters: Accepted but not used by this method.
    """
    if self.hapaxer:
        self.log("Training hapaxer...")
        self.hapaxer.train()

    left = self.settings['leftcontext']
    right = self.settings['rightcontext']
    windowsize = left + right + 1  # loop invariant, computed once

    self.log("Generating training instances...")
    fileprefix = modelfile.replace(".ibase", "")  # has been verified earlier
    classifier = TimblClassifier(fileprefix, self.gettimbloptions())

    if sourcefile.endswith(".bz2"):
        iomodule = bz2
    elif sourcefile.endswith(".gz"):
        iomodule = gzip
    else:
        iomodule = io

    prevword = ""
    buffer = []
    with iomodule.open(sourcefile, mode='rt', encoding='utf-8', errors='ignore') as f:
        for i, line in enumerate(f):
            if i % 100000 == 0:
                print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
            words = [w.strip() for w in line.split(' ') if w.strip()]
            # Iterate the words directly: the index was never used and
            # re-binding ``i`` here clobbered the line counter above.
            for word in words:
                if prevword in TIMBLPuncRecaseModule.PUNCTUATION:
                    punc = prevword
                else:
                    punc = ""
                if any(c.isalpha() for c in word):
                    buffer.append((word, word == word[0].upper() + word[1:].lower(), punc))
                if len(buffer) == windowsize:
                    buffer = self.addtraininstance(classifier, buffer, left, right)
                prevword = word

    self.log("Training classifier...")
    classifier.train()

    self.log("Saving model " + modelfile)
    classifier.save()
def train(self, sourcefile, modelfile, **parameters):
    """Train and save the Timbl classifier from a tokenised text corpus.

    Lines of ``sourcefile`` are split on single spaces. Tokens containing at
    least one alphabetic character are appended to a rolling buffer as
    ``(word, is_capitalised, punc)``: ``is_capitalised`` tells whether the
    word equals its first character uppercased plus the remainder lowercased,
    and ``punc`` is the directly preceding token when that token is in
    ``PUNCTUATION`` (otherwise the empty string). Once the buffer contains
    ``leftcontext + rightcontext + 1`` entries it is handed to
    ``self.addtraininstance`` and replaced by that call's return value.

    Neither the buffer nor the previous token is reset at line boundaries, and
    no begin/end padding is inserted.

    Args:
        sourcefile: Path of the corpus; ``.bz2``/``.gz`` select the ``bz2``/
            ``gzip`` module, anything else ``io``. Read as UTF-8, ignoring
            undecodable bytes.
        modelfile: Path of the model; with ``.ibase`` removed it is the file
            prefix handed to ``TimblClassifier``.
        **parameters: Accepted but not used by this method.
    """
    if self.hapaxer:
        self.log("Training hapaxer...")
        self.hapaxer.train()

    leftcontext = self.settings['leftcontext']
    rightcontext = self.settings['rightcontext']
    buffersize = leftcontext + rightcontext + 1  # hoisted out of the word loop

    self.log("Generating training instances...")
    fileprefix = modelfile.replace(".ibase", "")  # has been verified earlier
    classifier = TimblClassifier(fileprefix, self.gettimbloptions())

    if sourcefile.endswith(".bz2"):
        iomodule = bz2
    elif sourcefile.endswith(".gz"):
        iomodule = gzip
    else:
        iomodule = io

    prevword = ""
    buffer = []
    with iomodule.open(sourcefile, mode='rt', encoding='utf-8', errors='ignore') as f:
        for linenum, line in enumerate(f):
            if linenum % 100000 == 0:
                print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(linenum), file=sys.stderr)
            # The inner loop used to enumerate into the same variable as the
            # line counter without ever using the index; iterate plainly.
            for word in (w.strip() for w in line.split(' ') if w.strip()):
                punc = prevword if prevword in PUNCTUATION else ""
                if any(c.isalpha() for c in word):
                    buffer.append((word, word == word[0].upper() + word[1:].lower(), punc))
                if len(buffer) == buffersize:
                    buffer = self.addtraininstance(classifier, buffer, leftcontext, rightcontext)
                prevword = word

    self.log("Training classifier...")
    classifier.train()

    self.log("Saving model " + modelfile)
    classifier.save()
def train(self, sourcefile, modelfile, **parameters):
    """Train and save a Timbl classifier for the configured confusibles.

    For every n-gram yielded by ``Windower(line, n)`` whose focus item (index
    ``leftcontext``) is listed in ``self.settings['confusibles']``, one
    training instance is added: the focus item is the class label and the
    surrounding items, passed through ``self.hapaxer`` first when a hapaxer is
    set, are the features.

    Args:
        sourcefile: Path of the corpus; ``.bz2``/``.gz`` select the ``bz2``/
            ``gzip`` module, anything else ``io``. Read as UTF-8, ignoring
            undecodable bytes.
        modelfile: Path of the model; with ``.ibase`` removed it is the file
            prefix handed to ``TimblClassifier``.
        **parameters: Accepted but not used by this method.
    """
    if self.hapaxer:
        self.log("Training hapaxer...")
        self.hapaxer.train()

    left = self.settings['leftcontext']
    right = self.settings['rightcontext']
    windowsize = left + 1 + right

    self.log("Generating training instances...")
    # has been verified earlier
    classifier = TimblClassifier(modelfile.replace(".ibase", ""), self.gettimbloptions())

    # Pick the open() matching the compression suggested by the file extension.
    if sourcefile.endswith(".bz2"):
        opener = bz2.open
    elif sourcefile.endswith(".gz"):
        opener = gzip.open
    else:
        opener = io.open

    with opener(sourcefile, mode='rt', encoding='utf-8', errors='ignore') as corpus:
        for linenum, line in enumerate(corpus):
            if linenum % 100000 == 0:
                # Progress report on stderr, once every 100000 lines.
                stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print(stamp + " - " + str(linenum), file=sys.stderr)
            for window in Windower(line, windowsize):
                target = window[left]
                if target not in self.settings['confusibles']:
                    continue
                if self.hapaxer:
                    window = self.hapaxer(window)
                features = tuple(window[:left]) + tuple(window[left + 1:])
                classifier.append(features, target)

    self.log("Training classifier...")
    classifier.train()

    self.log("Saving model " + modelfile)
    classifier.save()
def train(self, sourcefile, modelfile, **parameters):
    """Train the confusible classifier on ``sourcefile`` and save it.

    The corpus is read line by line. Each n-gram yielded by
    ``Windower(line, n)`` is inspected: when its focus item (index
    ``leftcontext``) occurs in ``self.settings["confusibles"]`` the n-gram,
    after optional processing by ``self.hapaxer``, contributes one instance
    with the context items as features and the focus item as class.

    Args:
        sourcefile: Corpus path. A ``.bz2`` or ``.gz`` suffix selects the
            ``bz2`` or ``gzip`` module for opening; otherwise ``io`` is used.
            Decoded as UTF-8 with errors ignored.
        modelfile: Model path; with ``.ibase`` removed it is the file prefix
            handed to ``TimblClassifier``.
        **parameters: Accepted but not used by this method.
    """
    if self.hapaxer:
        self.log("Training hapaxer...")
        self.hapaxer.train()

    n_left = self.settings["leftcontext"]
    n_right = self.settings["rightcontext"]
    size = n_left + 1 + n_right

    self.log("Generating training instances...")
    prefix = modelfile.replace(".ibase", "")  # has been verified earlier
    timbl = TimblClassifier(prefix, self.gettimbloptions())

    # Conditional expressions keep the module lookup lazy, like the if/elif chain.
    reader = bz2 if sourcefile.endswith(".bz2") else (gzip if sourcefile.endswith(".gz") else io)

    with reader.open(sourcefile, mode="rt", encoding="utf-8", errors="ignore") as handle:
        for count, text in enumerate(handle):
            if count % 100000 == 0:
                now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print(now + " - " + str(count), file=sys.stderr)
            for gram in Windower(text, size):
                focus = gram[n_left]
                if focus in self.settings["confusibles"]:
                    if self.hapaxer:
                        gram = self.hapaxer(gram)
                    before = tuple(gram[:n_left])
                    after = tuple(gram[n_left + 1 :])
                    timbl.append(before + after, focus)

    self.log("Training classifier...")
    timbl.train()

    self.log("Saving model " + modelfile)
    timbl.save()
def train(self, sourcefile, modelfile, **parameters):
    """Train one of the two models of the suffix confusible module.

    What is built depends on ``modelfile``:

    * ``self.confusiblefile``: a unigram frequency list is computed over
      ``sourcefile`` with colibricore and used to find words that end in one
      of ``self.suffixes`` and whose variants with every other suffix also
      occur in the model. The result is stored in ``self.confusibles`` and
      written to ``modelfile``, one word per line.
    * ``self.modelfile``: a Timbl classifier is trained that predicts the
      suffix of a confusible from its context plus the normalised word (as
      returned by ``self.getsuffix``). ``self.confusibles`` is loaded from
      ``self.confusiblefile`` when it has not been set yet.

    Any other value of ``modelfile`` makes this method do nothing.

    Args:
        sourcefile: Path of the corpus. For the classifier, ``.bz2``/``.gz``
            select the ``bz2``/``gzip`` module, anything else ``io``; the text
            is read as UTF-8 with undecodable bytes ignored.
        modelfile: Path of the model to build (see above).
        **parameters: Accepted but not used by this method.
    """
    if modelfile == self.confusiblefile:
        #Build frequency list
        self.log("Preparing to generate lexicon for suffix confusible module")
        classfile = stripsourceextensions(sourcefile) + ".cls"
        corpusfile = stripsourceextensions(sourcefile) + ".dat"

        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder("", self.settings['minlength'], self.settings['maxlength'])  #character length constraints
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile, self.settings['minlength'], self.settings['maxlength'])

        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile, corpusfile)

        self.log("Generating frequency list")
        options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'], minlength=1, maxlength=1)  #unigrams only
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)

        self.log("Finding confusible pairs")
        classdecoder = colibricore.ClassDecoder(classfile)
        self.confusibles = []  #pylint: disable=attribute-defined-outside-init
        seen = set()  # mirrors self.confusibles for constant-time membership tests
        maxratio = self.settings['maxratio']

        for pattern in model:
            try:
                pattern_s = pattern.tostring(classdecoder)
            except UnicodeDecodeError:
                self.log("WARNING: Unable to decode a pattern in the model!!! Invalid utf-8!")
                # Skip the pattern: falling through would use an unbound or
                # stale pattern_s from a previous iteration.
                continue

            for suffix in self.suffixes:
                if pattern_s.endswith(suffix) and pattern_s not in seen:
                    found = []
                    for othersuffix in self.suffixes:
                        if othersuffix != suffix:
                            otherpattern_s = pattern_s[:-len(suffix)] + othersuffix
                            try:
                                otherpattern = classencoder.buildpattern(otherpattern_s, False, False)
                            except KeyError:
                                # The variant contains an unknown class: reject the whole set.
                                found = []
                                break
                            if otherpattern not in model:
                                found = []
                                break
                            if maxratio != 0:
                                freqs = (model.occurrencecount(pattern), model.occurrencecount(otherpattern))
                                ratio = max(freqs) / min(freqs)
                                # NOTE(review): the set is rejected when the ratio is
                                # *below* 'maxratio'; confirm this direction is intended.
                                if ratio < maxratio:
                                    found = []
                                    break
                            found.append(otherpattern_s)
                    if found:
                        self.confusibles.append(pattern_s)
                        seen.add(pattern_s)
                        for s in found:
                            self.confusibles.append(s)
                            seen.add(s)

        self.log("Writing confusible list")
        with open(modelfile, 'w', encoding='utf-8') as f:
            for confusible in self.confusibles:
                f.write(confusible + "\n")

    elif modelfile == self.modelfile:
        if not hasattr(self, 'confusibles'):
            self.confusibles = []
            self.log("Loading confusiblefile")
            with open(self.confusiblefile, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        self.confusibles.append(line)

        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "")  #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io

        # Build the lookup set once instead of scanning the list for every n-gram.
        confusibles = set(self.confusibles)

        with iomodule.open(sourcefile, mode='rt', encoding='utf-8', errors='ignore') as f:
            for i, line in enumerate(f):
                # Report progress once per 100000 lines (not once per n-gram).
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                for ngram in Windower(line, n):
                    confusible = ngram[l]
                    if confusible in confusibles:
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l + 1:])
                        suffix, normalized = self.getsuffix(confusible)
                        if suffix is not None:
                            classifier.append(leftcontext + (normalized,) + rightcontext, suffix)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()
def train(self, sourcefile, modelfile, **parameters):
    """Train one of the two models of the suffix confusible module.

    What is built depends on ``modelfile``:

    * ``self.confusiblefile``: a unigram frequency list is computed over
      ``sourcefile`` with colibricore and used to find words that end in one
      of ``self.suffixes`` and whose variants with every other suffix also
      occur in the model. The result is stored in ``self.confusibles`` and
      written to ``modelfile``, one word per line.
    * ``self.modelfile``: a Timbl classifier is trained that predicts the
      suffix of a confusible from its context plus the normalised word (as
      returned by ``self.getsuffix``). When ``self.confusibles`` has not been
      set yet, it is read from ``self.confusiblefile`` first.

    Any other value of ``modelfile`` makes this method do nothing.

    Args:
        sourcefile: Path of the corpus. For the classifier, ``.bz2``/``.gz``
            select the ``bz2``/``gzip`` module, anything else ``io``; the text
            is read as UTF-8 with undecodable bytes ignored.
        modelfile: Path of the model to build (see above).
        **parameters: Accepted but not used by this method.
    """
    if modelfile == self.confusiblefile:
        #Build frequency list
        self.log("Preparing to generate lexicon for suffix confusible module")
        classfile = stripsourceextensions(sourcefile) + ".cls"
        corpusfile = stripsourceextensions(sourcefile) + ".dat"

        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder("", self.settings['minlength'], self.settings['maxlength'])  #character length constraints
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile, self.settings['minlength'], self.settings['maxlength'])

        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile, corpusfile)

        self.log("Generating frequency list")
        options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'], minlength=1, maxlength=1)  #unigrams only
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)

        self.log("Finding confusible pairs")
        classdecoder = colibricore.ClassDecoder(classfile)
        self.confusibles = []  #pylint: disable=attribute-defined-outside-init
        known = set()  # same content as self.confusibles, for fast membership tests
        maxratio = self.settings['maxratio']

        for pattern in model:
            try:
                pattern_s = pattern.tostring(classdecoder)
            except UnicodeDecodeError:
                self.log("WARNING: Unable to decode a pattern in the model!!! Invalid utf-8!")
                # Without this the code below would run with pattern_s unbound
                # (first pattern) or left over from the previous pattern.
                continue

            for suffix in self.suffixes:
                if pattern_s.endswith(suffix) and pattern_s not in known:
                    found = []
                    for othersuffix in self.suffixes:
                        if othersuffix != suffix:
                            otherpattern_s = pattern_s[:-len(suffix)] + othersuffix
                            try:
                                otherpattern = classencoder.buildpattern(otherpattern_s, False, False)
                            except KeyError:
                                # Variant cannot be encoded: discard the whole candidate set.
                                found = []
                                break
                            if otherpattern not in model:
                                found = []
                                break
                            if maxratio != 0:
                                freqs = (model.occurrencecount(pattern), model.occurrencecount(otherpattern))
                                ratio = max(freqs) / min(freqs)
                                # NOTE(review): candidates are discarded when the ratio is
                                # *below* 'maxratio'; confirm this direction is intended.
                                if ratio < maxratio:
                                    found = []
                                    break
                            found.append(otherpattern_s)
                    if found:
                        self.confusibles.append(pattern_s)
                        known.add(pattern_s)
                        for s in found:
                            self.confusibles.append(s)
                            known.add(s)

        self.log("Writing confusible list")
        with open(modelfile, 'w', encoding='utf-8') as f:
            for confusible in self.confusibles:
                f.write(confusible + "\n")

    elif modelfile == self.modelfile:
        # The confusible list may not be in memory (e.g. when only this model
        # is being trained); read it back from the confusible file then,
        # instead of failing with AttributeError further down.
        if not hasattr(self, 'confusibles'):
            self.confusibles = []  #pylint: disable=attribute-defined-outside-init
            self.log("Loading confusiblefile")
            with open(self.confusiblefile, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        self.confusibles.append(line)

        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()

        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "")  #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io

        # One set lookup per n-gram instead of a linear scan of the list.
        confusibles = set(self.confusibles)

        with iomodule.open(sourcefile, mode='rt', encoding='utf-8', errors='ignore') as f:
            for i, line in enumerate(f):
                # Progress is reported per line; inside the n-gram loop it was
                # printed once for every n-gram of each 100000th line.
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                for ngram in Windower(line, n):
                    confusible = ngram[l]
                    if confusible in confusibles:
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l + 1:])
                        suffix, normalized = self.getsuffix(confusible)
                        if suffix is not None:
                            classifier.append(leftcontext + (normalized,) + rightcontext, suffix)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()
def train(self, sourcefile, modelfile, **parameters):
    """Train either the Timbl classifier or the colibricore pattern model.

    The hapaxer, when set, is trained first. What happens next depends on the
    extension of ``modelfile``:

    * ``.ibase``: every n-gram yielded by ``Windower(line, n)`` is (optionally)
      passed through the hapaxer and added as a training instance with the
      focus item as class and the context items as features. N-grams whose
      focus equals the hapaxer's placeholder are skipped. The classifier is
      then trained and saved.
    * ``.patternmodel``: a class file (``.cls``) and an encoded corpus
      (``.dat``) are created next to the source when missing, a symlink
      ``modelfile + '.cls'`` to the class file is created when missing, and a
      unigram pattern model is trained and written to ``modelfile``.

    Any other extension results in only the hapaxer being trained.

    Args:
        sourcefile: Path of the corpus. For the classifier, ``.bz2``/``.gz``
            select the ``bz2``/``gzip`` module, anything else ``io``; the text
            is read as UTF-8.
        modelfile: Path of the model to build (see above).
        **parameters: Accepted but not used by this method.
    """
    if self.hapaxer:
        self.log("Training hapaxer...")
        self.hapaxer.train()

    if modelfile.endswith('.ibase'):
        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "")  #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile, mode='rt', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                for ngram in Windower(line, n):
                    if self.hapaxer:
                        ngram = self.hapaxer(ngram)
                    focus = ngram[l]
                    # Do not train on instances whose class was replaced by the placeholder.
                    if self.hapaxer and focus == self.hapaxer.placeholder:
                        continue
                    leftcontext = tuple(ngram[:l])
                    rightcontext = tuple(ngram[l + 1:])
                    classifier.append(leftcontext + rightcontext, focus)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()
    elif modelfile.endswith('.patternmodel'):
        self.log("Preparing to generate lexicon for Language Model")
        classfile = stripsourceextensions(sourcefile) + ".cls"
        corpusfile = stripsourceextensions(sourcefile) + ".dat"

        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder()
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile)

        # Make symlink to class file, using model name instead of source name.
        # (This check used to be repeated verbatim after the corpus encoding.)
        if not os.path.exists(modelfile + '.cls'):
            os.symlink(classfile, modelfile + '.cls')

        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile, corpusfile)

        self.log("Generating pattern model")
        options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'], minlength=1, maxlength=1)
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)

        self.log("Saving model " + modelfile)
        model.write(modelfile)
def train(self, sourcefile, modelfile, **parameters):
    """Train the model stored in ``modelfile`` from the corpus ``sourcefile``.

    ``self.hapaxer`` is trained first when present. The extension of
    ``modelfile`` then selects the model type:

    * ``.ibase``: a Timbl classifier. Each n-gram yielded by
      ``Windower(line, n)`` is run through the hapaxer (when present); unless
      its focus item equals ``self.hapaxer.placeholder`` it is added as an
      instance whose class is the focus item and whose features are the
      surrounding items.
    * ``.patternmodel``: a colibricore unigram pattern model. The class file
      and encoded corpus derived from ``sourcefile`` are built only when they
      do not exist yet, and ``modelfile + '.cls'`` is symlinked to the class
      file when it does not exist yet.

    With any other extension nothing beyond the hapaxer training happens.

    Args:
        sourcefile: Corpus path. In the ``.ibase`` case a ``.bz2`` or ``.gz``
            suffix selects ``bz2`` or ``gzip`` for reading, otherwise ``io``;
            the text is decoded as UTF-8.
        modelfile: Path of the model to build (see above).
        **parameters: Accepted but not used by this method.
    """
    if self.hapaxer:
        self.log("Training hapaxer...")
        self.hapaxer.train()

    if modelfile.endswith('.ibase'):
        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r

        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "")  #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile, mode='rt', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                for ngram in Windower(line, n):
                    if self.hapaxer:
                        ngram = self.hapaxer(ngram)
                    focus = ngram[l]
                    if self.hapaxer and focus == self.hapaxer.placeholder:
                        # The class itself was replaced by the placeholder; skip it.
                        continue
                    leftcontext = tuple(ngram[:l])
                    rightcontext = tuple(ngram[l + 1:])
                    classifier.append(leftcontext + rightcontext, focus)

        self.log("Training classifier...")
        classifier.train()

        self.log("Saving model " + modelfile)
        classifier.save()
    elif modelfile.endswith('.patternmodel'):
        self.log("Preparing to generate lexicon for Language Model")
        classfile = stripsourceextensions(sourcefile) + ".cls"
        corpusfile = stripsourceextensions(sourcefile) + ".dat"

        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder()
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile)

        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile, corpusfile)

        # Make symlink to class file, using model name instead of source name.
        # A single check suffices; the same check used to appear twice.
        if not os.path.exists(modelfile + '.cls'):
            os.symlink(classfile, modelfile + '.cls')

        self.log("Generating pattern model")
        options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'], minlength=1, maxlength=1)
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)

        self.log("Saving model " + modelfile)
        model.write(modelfile)