示例#1
0
class bilboEval:
	"""Evaluate the labelling results of every partition of a corpus.

	For each partition directory the labelled output and the expected output
	are read from temporary files, harmonized, evaluated, and the
	per-partition figures plus their average are written to disk.
	"""

	def __init__(self, dirCorpus, testPercentage, numberOfPartition=10, prefix=''):
		"""Create the ``Partition`` helper and cache its partition directories.

		All arguments are forwarded unchanged to ``Partition``.
		"""
		self.partitions = Partition(dirCorpus, testPercentage, numberOfPartition, prefix)
		self.dirPartitions = self.partitions.getDirPartitionNames()

	def eval(self):
		"""Evaluate every partition and write the per-partition and global reports.

		For each partition directory:
		  * read ``testEstCRF_Wapiti.txt`` (labelled) from the result directory
		    and ``evaldata_CRF_Wapiti.txt`` (desired) from the train directory,
		  * harmonize both with ``prepareEval.prepareEval`` and save them as
		    ``annotatedEval.txt`` / ``desiredEval.txt`` in the result directory,
		  * evaluate with ``TokenAccuracyEval.evaluate`` and save the text
		    report as ``evaluation.txt`` in the partition directory.

		Finally ``evaluation.tsv`` is written in the directory returned by
		``getDirPercentName()``: a header line with the labels, one line of
		values per partition, and the column-wise average on the last line.

		Raises:
			ValueError: if there is no partition to evaluate.
			IndexError: if an expected temporary file is missing (see ``_getFile``).
		"""
		allValues = []
		labels = []
		for dirPartition in self.dirPartitions:
			(annotateDir, testDir, trainDir, modelDir, resultDir) = self.partitions.getDirTestNames(dirPartition)

			labeledContent = self._getFile(resultDir, 'testEstCRF_Wapiti.txt')
			# tmp files produced while training on testDir are saved in trainDir
			desiredContent = self._getFile(trainDir, 'evaldata_CRF_Wapiti.txt')

			# harmonize the two contents, they are not tokenized the same way
			desiredContentHarmonized, labeledContentHarmonized = prepareEval.prepareEval(desiredContent, labeledContent)

			self._saveFile(labeledContentHarmonized, resultDir, 'annotatedEval.txt')
			self._saveFile(desiredContentHarmonized, resultDir, 'desiredEval.txt')

			evalText, labels, values = TokenAccuracyEval.evaluate(labeledContentHarmonized, desiredContentHarmonized)
			allValues.append(values)
			self._saveFile(evalText, dirPartition, 'evaluation.txt')

		if not allValues:
			# Without this guard the header below would hit an unbound 'labels'.
			raise ValueError("no partition to evaluate")

		# column-wise average of the results of all partitions
		average = [float(sum(col)) / len(col) for col in zip(*allValues)]
		allValues.append(average)

		# one line per partition, the average on the last line
		rows = ["\t".join(['{:f}'.format(v) for v in row]) for row in allValues]
		finalEval = "\t".join(labels) + "\n" + "\n".join(rows)
		self._saveFile(finalEval, self.partitions.getDirPercentName(), 'evaluation.tsv')

	def _getFile(self, fileDir, pattern):
		"""Return the UTF-8 content of ``<fileDir>/tmp*/<pattern>``.

		When several files match, the lexicographically first path is used so
		that the choice does not depend on the directory listing order.

		Raises:
			IndexError: if no file matches the pattern.
		"""
		pattern = os.path.join(fileDir, 'tmp*', pattern)
		files = sorted(glob.glob(pattern))
		if not files:
			raise IndexError("no file matches {!r}".format(pattern))
		with open(files[0], 'r', encoding='utf-8') as content_file:
			return content_file.read()

	def _saveFile(self, content, dirName, fileName):
		"""Write ``content`` as UTF-8 text to ``<dirName>/<fileName>``, overwriting it."""
		fileName = os.path.join(dirName, fileName)
		with open(fileName, 'w', encoding='utf-8') as content_file:
			content_file.write(content)
示例#2
0
文件: bilboEval.py 项目: ansdma/bilbo
class bilboEval:
	"""Evaluate the labelling results of every partition of a corpus.

	For each partition directory the labelled output and the expected output
	are read from temporary files, reformatted, aligned token by token,
	evaluated, and the per-partition figures plus their average are written
	to disk.
	"""

	def __init__(self, dirCorpus, testPercentage, numberOfPartition=10, prefix=''):
		"""Create the ``Partition`` helper and cache its partition directories.

		All arguments are forwarded unchanged to ``Partition``.
		"""
		self.partitions = Partition(dirCorpus, testPercentage, numberOfPartition, prefix)
		self.dirPartitions = self.partitions.getDirPartitionNames()

	def eval(self):
		"""Evaluate every partition and write the per-partition and global reports.

		For each partition directory:
		  * read ``testEstCRF_Wapiti.txt`` from the result directory and
		    ``evaldata_CRF_Wapiti.txt`` from the train directory,
		  * reformat both with ``_formatEval`` and align them with
		    ``_harmonizeList``,
		  * save them as ``annotatedEval.txt`` / ``desiredEval.txt`` in the
		    result directory,
		  * evaluate with ``TokenAccuracyEval.evaluate`` and save the text
		    report as ``evaluation.txt`` in the partition directory.

		Finally ``evaluation.tsv`` is written in the directory returned by
		``getDirPercentName()``: a header line with the labels, one line of
		values per partition, and the column-wise average on the last line.

		Raises:
			ValueError: if there is no partition to evaluate.
			IndexError: if an expected temporary file is missing (see ``_getFile``).
		"""
		allValues = []
		labels = []
		for dirPartition in self.dirPartitions:
			(annotateDir, testDir, trainDir, modelDir, resultDir) = self.partitions.getDirTestNames(dirPartition)

			testEstCRF = self._getFile(resultDir, 'testEstCRF_Wapiti.txt')
			testEstCRFFormated = self._formatEval(testEstCRF)

			desiredResult = self._getFile(trainDir, 'evaldata_CRF_Wapiti.txt')
			desiredResultFormated = self._formatEval(desiredResult)

			# the two outputs are not tokenized the same way: align them
			desiredResultFormated, testEstCRFFormated = self._harmonizeList(desiredResultFormated, testEstCRFFormated)

			testEstCRFFormated = "\n".join(testEstCRFFormated)
			desiredResultFormated = "\n".join(desiredResultFormated)
			self._saveFile(testEstCRFFormated, resultDir, 'annotatedEval.txt')
			self._saveFile(desiredResultFormated, resultDir, 'desiredEval.txt')

			evalText, labels, values = TokenAccuracyEval.evaluate(testEstCRFFormated, desiredResultFormated)
			allValues.append(values)
			self._saveFile(evalText, dirPartition, 'evaluation.txt')

		if not allValues:
			# Without this guard the header below would hit an unbound 'labels'.
			raise ValueError("no partition to evaluate")

		# column-wise average of the results of all partitions
		average = [float(sum(col)) / len(col) for col in zip(*allValues)]
		allValues.append(average)

		# one line per partition, the average on the last line
		rows = ["\t".join(['{:f}'.format(v) for v in row]) for row in allValues]
		finalEval = "\t".join(labels) + "\n" + "\n".join(rows)
		self._saveFile(finalEval, self.partitions.getDirPercentName(), 'evaluation.tsv')

	def _getFeatureAndName(self, token):
		"""Split a ``feature<TAB>name`` line into ``(feature, name)``.

		``name`` is the empty string when the line has no tab. Fields after
		the second one are ignored. When the name is made of several
		whitespace-separated parts, they are glued together without the
		whitespace (workaround for stray non-printing characters).
		"""
		words = token.split("\t")
		feature = words[0]
		name = words[1] if len(words) > 1 else ""
		parts = name.split()
		if len(parts) > 1:
			name = "".join(parts)
		return feature, name

	def _harmonizeList(self, shortList, longList):
		"""Align two ``feature<TAB>name`` lists that were tokenized differently.

		Both lists are walked in parallel. While the two current names differ,
		the following token of the list whose name compares smaller is
		concatenated to it, until both names are equal. Each aligned pair
		yields one entry in each output list, carrying the feature of the
		first token of the group; an empty name yields an empty string.

		If one list runs out of tokens before the names can be made equal,
		the pair is emitted as it stands instead of failing. Empty input
		produces two empty lists.

		Returns:
			tuple: ``(newShortList, newLongList)``, two lists of equal length.
		"""
		lengthShort = len(shortList)
		lengthLong = len(longList)
		newShortList = []
		newLongList = []
		indexShort = 0
		indexLong = 0
		while indexShort < lengthShort and indexLong < lengthLong:
			featureShort, partShort = self._getFeatureAndName(shortList[indexShort])
			featureLong, partLong = self._getFeatureAndName(longList[indexLong])

			while partShort != partLong:
				if partShort < partLong:
					if indexShort + 1 >= lengthShort:
						break  # nothing left to append: keep the mismatch
					indexShort += 1
					_, partShortAppend = self._getFeatureAndName(shortList[indexShort])
					partShort += partShortAppend
				else:
					if indexLong + 1 >= lengthLong:
						break  # nothing left to append: keep the mismatch
					indexLong += 1
					_, partLongAppend = self._getFeatureAndName(longList[indexLong])
					partLong += partLongAppend

			newShortList.append(featureShort + "\t" + partShort if partShort else '')
			newLongList.append(featureLong + "\t" + partLong if partLong else '')

			indexShort += 1
			indexLong += 1

		return newShortList, newLongList

	def _getFile(self, fileDir, pattern):
		"""Return the UTF-8 content of ``<fileDir>/tmp*/<pattern>``.

		When several files match, the lexicographically first path is used so
		that the choice does not depend on the directory listing order.

		Raises:
			IndexError: if no file matches the pattern.
		"""
		pattern = os.path.join(fileDir, 'tmp*', pattern)
		files = sorted(glob.glob(pattern))
		if not files:
			raise IndexError("no file matches {!r}".format(pattern))
		with open(files[0], 'r', encoding='utf-8') as content_file:
			return content_file.read()

	def _saveFile(self, content, dirName, fileName):
		"""Write ``content`` as UTF-8 text to ``<dirName>/<fileName>``, overwriting it."""
		fileName = os.path.join(dirName, fileName)
		with open(fileName, 'w', encoding='utf-8') as content_file:
			content_file.write(content)

	def _formatEval(self, content):
		"""Turn space-separated lines into ``last<TAB>first`` lines.

		Each line of ``content`` is split on single spaces. A line with at
		least two fields becomes ``<last field>\\t<first field>`` (both
		stripped); any other line becomes an empty string, so the output has
		one entry per input line.
		"""
		formated = []
		for line in content.split("\n"):
			words = line.split(" ")
			if len(words) > 1:
				formated.append(words[-1].strip() + "\t" + words[0].strip())
			else:
				formated.append('')
		return formated