class bibloTrain():
    """Run Bilbo training on every partition directory of a corpus.

    The partition layout is delegated to ``Partition``; this class only
    configures the options object and loops over the partitions.
    """

    def __init__(self, bilboOptions, dirCorpus, testPercentage, numberOfPartition=10, prefix=''):
        """Configure the options and build the partition helper.

        bilboOptions      -- options object handed to ``Bilbo``; it is
                             modified in place (T, L and t are overwritten).
        dirCorpus, testPercentage, numberOfPartition, prefix
                          -- forwarded unchanged to ``Partition``.
        """
        self.bilboOptions = bilboOptions
        # presumably T = training mode and L = labelling mode -- confirm
        # against the Bilbo option parser.
        self.bilboOptions.T = True
        self.bilboOptions.L = False
        self.bilboOptions.t = 'bibl'
        # The 'k' option is deliberately left as the caller set it.
        self.partitions = Partition(dirCorpus, testPercentage, numberOfPartition, prefix)
        self.dirPartitions = self.partitions.getDirPartitionNames()

    def train(self):
        """Train a "crf_model_simple" model for each partition directory."""
        for dirPartition in self.dirPartitions:
            # Single-string form prints the same text on Python 2 and 3.
            print("dirPartition %s" % (dirPartition,))
            # Only the train and model directories are needed here.
            (_, _, trainDir, modelDir, _) = self.partitions.getDirTestNames(dirPartition)
            self._del_tmp_file(trainDir)  # tmp file of test data are here
            # tmpFiles saved in modelDir if -k all
            bilbo = Bilbo(modelDir, self.bilboOptions, "crf_model_simple")
            bilbo.train(trainDir, modelDir, 1)

    def _del_tmp_file(self, resultDir):
        """Delete every entry named ``tmp*`` directly inside *resultDir*.

        Real directories are removed recursively; plain files and symlinks
        are unlinked (``shutil.rmtree`` alone would raise on those).
        """
        for tmpPath in glob.glob(os.path.join(resultDir, 'tmp*')):
            if os.path.isdir(tmpPath) and not os.path.islink(tmpPath):
                shutil.rmtree(tmpPath)
            else:
                os.remove(tmpPath)
class bilboEval():
    """Evaluate the annotation results of every partition of a corpus.

    NOTE(review): this file contains a second class named ``bilboEval``
    further down; if both are defined at module level, the later definition
    replaces this one at import time -- confirm which one is meant to be used.
    """

    def __init__(self, dirCorpus, testPercentage, numberOfPartition=10, prefix=''):
        """Build the partition helper; all arguments go to ``Partition``."""
        self.partitions = Partition(dirCorpus, testPercentage, numberOfPartition, prefix)
        self.dirPartitions = self.partitions.getDirPartitionNames()

    def eval(self):
        """Evaluate each partition and write per-partition and global reports.

        For every partition the labelled and desired outputs are read,
        harmonized through ``prepareEval.prepareEval``, saved next to the
        results, and scored with ``TokenAccuracyEval.evaluate``.  A final
        ``evaluation.tsv`` holds one row per partition plus an average row.

        Raises ValueError when there is no partition to evaluate.
        """
        allValues = []
        labels = []
        for dirPartition in self.dirPartitions:
            (_, _, trainDir, _, resultDir) = self.partitions.getDirTestNames(dirPartition)
            labeledContent = self._getFile(resultDir, 'testEstCRF_Wapiti.txt')
            # tmpFiles from training of testDir are saved in trainDir !
            desiredContent = self._getFile(trainDir, 'evaldata_CRF_Wapiti.txt')

            # harmonize the two lists, they are not tokenized the same way
            desiredContentHarmonized, labeledContentHarmonized = prepareEval.prepareEval(desiredContent, labeledContent)
            self._saveFile(labeledContentHarmonized, resultDir, 'annotatedEval.txt')
            self._saveFile(desiredContentHarmonized, resultDir, 'desiredEval.txt')

            evalText, labels, values = TokenAccuracyEval.evaluate(labeledContentHarmonized, desiredContentHarmonized)
            allValues.append(values)
            self._saveFile(evalText, dirPartition, 'evaluation.txt')

        if not allValues:
            # Without this guard the report code below would fail on missing
            # labels with an unrelated-looking error.
            raise ValueError("no partition to evaluate")

        # calculate average of results for all partitions
        average = [float(sum(col)) / len(col) for col in zip(*allValues)]
        allValues.append(average)

        # print all results and average on the last line
        rows = ["\t".join(['{:f}'.format(v) for v in row]) for row in allValues]
        finalEval = "\t".join(labels) + "\n" + "\n".join(rows)
        self._saveFile(finalEval, self.partitions.getDirPercentName(), 'evaluation.tsv')

    def _getFile(self, fileDir, pattern):
        """Return the text of the first ``fileDir/tmp*/pattern`` match.

        Matches are sorted so the choice is deterministic when several
        ``tmp*`` directories exist.  Raises IndexError when nothing matches.
        """
        pattern = os.path.join(fileDir, 'tmp*', pattern)
        files = sorted(glob.glob(pattern))
        if not files:
            raise IndexError("no file matching %r" % (pattern,))
        with open(files[0], 'r', encoding='utf-8') as content_file:
            return content_file.read()

    def _saveFile(self, content, dirName, fileName):
        """Write *content* as UTF-8 text to ``dirName/fileName`` (overwrites)."""
        fileName = os.path.join(dirName, fileName)
        with open(fileName, 'w', encoding='utf-8') as content_file:
            content_file.write(content)
class bilboAnnotate():
    """Annotate each partition's test data and prepare its evaluation data."""

    def __init__(self, bilboOptions, dirCorpus, testPercentage, numberOfPartition=10, prefix=''):
        """Configure the options and build the partition helper.

        bilboOptions      -- options object handed to ``Bilbo`` and
                             ``Corpus``; it is modified in place
                             (L, T, t, k and o are overwritten).
        dirCorpus, testPercentage, numberOfPartition, prefix
                          -- forwarded unchanged to ``Partition``.
        """
        self.bilboOptions = bilboOptions
        self.bilboOptions.L = True
        self.bilboOptions.T = False
        self.bilboOptions.t = 'bibl'
        self.bilboOptions.k = 'all'
        self.bilboOptions.o = 'simple'
        self.partitions = Partition(dirCorpus, testPercentage, numberOfPartition, prefix)
        self.dirPartitions = self.partitions.getDirPartitionNames()

    def annotate(self):
        """Run annotation, then evaluation-data extraction, per partition.

        The shared options object is left in training mode (L False, T True)
        after each partition because of the second step.
        """
        for dirPartition in self.dirPartitions:
            (annotateDir, testDir, trainDir, modelDir, resultDir) = self.partitions.getDirTestNames(dirPartition)

            # annotation of test data striped tagged
            self._setBilboAnnotate()
            self._del_tmp_file(resultDir)
            bilbo = Bilbo(resultDir, self.bilboOptions, "crf_model_simple")
            bilbo.annotate(annotateDir, modelDir, 1)

            # train with test data for evaluation
            self._setBilboTrain()
            self._del_tmp_file(trainDir)
            # To save tmpFiles in testDir
            bilbo = Bilbo(trainDir, self.bilboOptions, "crf_model_simple")
            corpus = Corpus(testDir, self.bilboOptions)
            corpus.extract(1, "bibl")
            # CRF training data extraction
            bilbo.crf.prepareTrain(corpus, 1, "evaldata_CRF.txt", 1, 1)

    def _setBilboAnnotate(self):
        """Switch the shared options to annotation mode (L on, T off)."""
        self.bilboOptions.L = True
        self.bilboOptions.T = False

    def _setBilboTrain(self):
        """Switch the shared options to training mode (L off, T on)."""
        self.bilboOptions.L = False
        self.bilboOptions.T = True

    def _del_tmp_file(self, resultDir):
        """Delete every entry named ``tmp*`` directly inside *resultDir*.

        Real directories are removed recursively; plain files and symlinks
        are unlinked (``shutil.rmtree`` alone would raise on those).
        """
        for tmpPath in glob.glob(os.path.join(resultDir, 'tmp*')):
            if os.path.isdir(tmpPath) and not os.path.islink(tmpPath):
                shutil.rmtree(tmpPath)
            else:
                os.remove(tmpPath)
class bilboEval():
    """Evaluate the annotation results of every partition of a corpus.

    This variant reformats and harmonizes the two token streams itself
    (``_formatEval`` / ``_harmonizeList``) before scoring them.

    NOTE(review): an earlier class named ``bilboEval`` exists in this file;
    if both are defined at module level, this later definition replaces it
    at import time -- confirm which one is meant to be used.
    """

    def __init__(self, dirCorpus, testPercentage, numberOfPartition=10, prefix=''):
        """Build the partition helper; all arguments go to ``Partition``."""
        self.partitions = Partition(dirCorpus, testPercentage, numberOfPartition, prefix)
        self.dirPartitions = self.partitions.getDirPartitionNames()

    def eval(self):
        """Evaluate each partition and write per-partition and global reports.

        Raises ValueError when there is no partition to evaluate.
        """
        allValues = []
        labels = []
        for dirPartition in self.dirPartitions:
            (_, _, trainDir, _, resultDir) = self.partitions.getDirTestNames(dirPartition)

            testEstCRF = self._getFile(resultDir, 'testEstCRF_Wapiti.txt')
            testEstCRFFormated = self._formatEval(testEstCRF)

            desiredResult = self._getFile(trainDir, 'evaldata_CRF_Wapiti.txt')
            desiredResultFormated = self._formatEval(desiredResult)

            # The two outputs are not tokenized the same way: align them.
            desiredResultFormated, testEstCRFFormated = self._harmonizeList(desiredResultFormated, testEstCRFFormated)
            testEstCRFFormated = "\n".join(testEstCRFFormated)
            desiredResultFormated = "\n".join(desiredResultFormated)

            self._saveFile(testEstCRFFormated, resultDir, 'annotatedEval.txt')
            self._saveFile(desiredResultFormated, resultDir, 'desiredEval.txt')

            evalText, labels, values = TokenAccuracyEval.evaluate(testEstCRFFormated, desiredResultFormated)
            allValues.append(values)
            self._saveFile(evalText, dirPartition, 'evaluation.txt')

        if not allValues:
            # Without this guard the report code below would fail on missing
            # labels with an unrelated-looking error.
            raise ValueError("no partition to evaluate")

        # One row per partition, then the column-wise average as last row.
        average = [float(sum(col)) / len(col) for col in zip(*allValues)]
        allValues.append(average)
        rows = ["\t".join(['{:f}'.format(v) for v in row]) for row in allValues]
        finalEval = "\t".join(labels) + "\n" + "\n".join(rows)
        self._saveFile(finalEval, self.partitions.getDirPercentName(), 'evaluation.tsv')

    def _getFeatureAndName(self, token):
        """Split a ``first<TAB>second`` entry into ``(feature, name)``.

        ``name`` is "" when there is no tab.  If ``name`` contains internal
        whitespace, all of its whitespace is removed.
        """
        words = token.split("\t")
        feature = words[0]
        name = words[1] if len(words) > 1 else ""
        pieces = name.split()
        if len(pieces) > 1:
            # kind of a bug: get rid of non printing utf-8 characters
            name = "".join(pieces)
        return feature, name

    # output is not the same length, before debug, dirty solution to harmonise output
    def _harmonizeList(self, shortList, longList):
        """Align two lists of ``feature<TAB>text`` entries.

        Both lists are walked in parallel.  While the current texts differ,
        the following entry's text is appended to one side until both texts
        are equal; the merged entry keeps the feature of its first part.
        Entries whose text is empty are emitted as ''.  Walking stops as soon
        as either list is exhausted, so any remaining tail is dropped.

        Returns ``(newShortList, newLongList)``; ``([], [])`` when either
        input is empty.  Raises IndexError when the texts cannot be aligned
        before one list runs out.
        """
        newShortList = []
        newLongList = []
        lengthShort = len(shortList)
        lengthLong = len(longList)
        indexShort = 0
        indexLong = 0
        while indexShort < lengthShort and indexLong < lengthLong:
            featureShort, partShort = self._getFeatureAndName(shortList[indexShort])
            featureLong, partLong = self._getFeatureAndName(longList[indexLong])
            while partShort != partLong:
                # NOTE(review): this is a lexicographic string comparison,
                # not a length comparison.  When one text is a prefix of the
                # other, the shorter one compares smaller and gets extended;
                # confirm the behaviour wanted when neither is a prefix.
                if partShort < partLong:
                    indexShort += 1
                    if indexShort >= lengthShort:
                        raise IndexError("cannot harmonize: first list exhausted while completing %r" % (partShort,))
                    partShort += self._getFeatureAndName(shortList[indexShort])[1]
                else:
                    indexLong += 1
                    if indexLong >= lengthLong:
                        raise IndexError("cannot harmonize: second list exhausted while completing %r" % (partLong,))
                    partLong += self._getFeatureAndName(longList[indexLong])[1]
            # Entries with no text stay empty lines.
            newShortList.append(featureShort + "\t" + partShort if partShort else '')
            newLongList.append(featureLong + "\t" + partLong if partLong else '')
            indexShort += 1
            indexLong += 1
        return newShortList, newLongList

    def _getFile(self, fileDir, pattern):
        """Return the text of the first ``fileDir/tmp*/pattern`` match.

        Matches are sorted so the choice is deterministic when several
        ``tmp*`` directories exist.  Raises IndexError when nothing matches.
        """
        pattern = os.path.join(fileDir, 'tmp*', pattern)
        files = sorted(glob.glob(pattern))
        if not files:
            raise IndexError("no file matching %r" % (pattern,))
        with open(files[0], 'r', encoding='utf-8') as content_file:
            return content_file.read()

    def _saveFile(self, content, dirName, fileName):
        """Write *content* as UTF-8 text to ``dirName/fileName`` (overwrites)."""
        fileName = os.path.join(dirName, fileName)
        with open(fileName, 'w', encoding='utf-8') as content_file:
            content_file.write(content)

    def _formatEval(self, content):
        """Turn space-separated lines into ``last<TAB>first`` entries.

        Each line of *content* is split on single spaces.  Lines with more
        than one field yield ``<last field>\\t<first field>`` (both
        stripped); all other lines yield ''.
        """
        formated = []
        for line in content.split("\n"):
            words = line.split(" ")
            if len(words) > 1:
                formated.append(words[-1].strip() + "\t" + words[0].strip())
            else:
                formated.append('')
        return formated