예제 #1
0
class bibloTrain():
	"""Train a Bilbo CRF model for every partition of a corpus.

	The options object handed to the constructor is modified in place:
	``T`` is set to True, ``L`` to False and ``t`` to ``'bibl'``
	(presumably the train / label mode switches and the reference type --
	confirm against the Bilbo option parser).
	"""

	def __init__(self, bilboOptions, dirCorpus, testPercentage, numberOfPartition=10, prefix=''):
		"""Configure the options for training and build the partition layout.

		bilboOptions      -- option object forwarded to ``Bilbo``; mutated here.
		dirCorpus, testPercentage, numberOfPartition, prefix
		                  -- forwarded unchanged to ``Partition``.
		"""
		self.bilboOptions = bilboOptions
		self.bilboOptions.T = True
		self.bilboOptions.L = False
		self.bilboOptions.t = 'bibl'
		# bilboOptions.k is left as supplied by the caller (forcing 'all' is disabled).
		self.partitions = Partition(dirCorpus, testPercentage, numberOfPartition, prefix)
		self.dirPartitions = self.partitions.getDirPartitionNames()

	def train(self):
		"""Run ``Bilbo.train`` once for each partition directory."""
		for dirPartition in self.dirPartitions:
			# A single pre-formatted argument prints the same text whether
			# ``print`` is the Python 2 statement or the Python 3 function.
			print("dirPartition %s" % (dirPartition,))
			# Only the train and model directories are needed here.
			(_annotateDir, _testDir, trainDir, modelDir, _resultDir) = self.partitions.getDirTestNames(dirPartition)

			self._del_tmp_file(trainDir) # tmp files of the test data live here
			bilbo = Bilbo(modelDir, self.bilboOptions, "crf_model_simple") # tmp files are saved in modelDir if -k all
			bilbo.train(trainDir, modelDir, 1)

	def _del_tmp_file(self, resultDir):
		"""Recursively delete every ``tmp*`` entry directly under ``resultDir``.

		Entries are removed with ``shutil.rmtree``, so they are expected to
		be directories.
		"""
		for tmpDir in glob.glob(os.path.join(resultDir, 'tmp*')):
			shutil.rmtree(tmpDir)
예제 #2
0
class bilboEval():
	"""Score the annotation of each partition and write an averaged summary."""

	def __init__(self, dirCorpus, testPercentage, numberOfPartition=10, prefix=''):
		"""Build the partition layout; all arguments are forwarded to ``Partition``."""
		self.partitions = Partition(dirCorpus, testPercentage, numberOfPartition, prefix)
		self.dirPartitions = self.partitions.getDirPartitionNames()

	def eval(self):
		"""Evaluate every partition and write per-partition and global reports.

		For each partition the annotated output and the expected output are
		read, harmonized with ``prepareEval.prepareEval``, saved next to the
		results and scored with ``TokenAccuracyEval.evaluate``; the score text
		goes to ``evaluation.txt`` in the partition directory.  Afterwards
		``evaluation.tsv`` is written to the percent directory: a header line
		of labels, one line of values per partition, and a final line holding
		the column-wise average.

		When there is no partition to evaluate nothing is written and the
		method simply returns.
		"""
		allValues = []
		labels = []  # bound up front so the summary never hits an unbound name
		for dirPartition in self.dirPartitions:
			(_annotateDir, _testDir, trainDir, _modelDir, resultDir) = self.partitions.getDirTestNames(dirPartition)

			labeledContent = self._getFile(resultDir, 'testEstCRF_Wapiti.txt')
			desiredContent = self._getFile(trainDir, 'evaldata_CRF_Wapiti.txt') # tmp files from training on testDir are saved in trainDir

			# harmonize the two contents, they are not tokenized the same way
			desiredContentHarmonized, labeledContentHarmonized = prepareEval.prepareEval(desiredContent, labeledContent)

			self._saveFile(labeledContentHarmonized, resultDir, 'annotatedEval.txt')
			self._saveFile(desiredContentHarmonized, resultDir, 'desiredEval.txt')

			evalText, labels, values = TokenAccuracyEval.evaluate(labeledContentHarmonized, desiredContentHarmonized)
			allValues.append(values)
			self._saveFile(evalText, dirPartition, 'evaluation.txt')

		if not allValues:
			# No partition was evaluated: there is no header and no average to report.
			return

		# average of the results over all partitions, column by column
		average = [float(sum(col))/len(col) for col in zip(*allValues)]
		allValues.append(average)

		# all results, one partition per line, with the average on the last line
		finalEval = "\t".join(labels) + "\n"
		finalEval += "\n".join(["\t".join(['{:f}'.format(v) for v in row]) for row in allValues])
		self._saveFile(finalEval, self.partitions.getDirPercentName(), 'evaluation.tsv')

	def _getFile(self, fileDir, pattern):
		"""Return the UTF-8 text of the first ``fileDir/tmp*/pattern`` match.

		Raises IndexError when no file matches.
		"""
		pattern = os.path.join(fileDir,'tmp*', pattern)
		files = glob.glob(pattern)
		with open(files[0], 'r', encoding='utf-8') as content_file:
			return content_file.read()

	def _saveFile(self, content, dirName, fileName):
		"""Write ``content`` as UTF-8 to ``dirName/fileName``, replacing any existing file."""
		fileName = os.path.join(dirName, fileName)
		with open(fileName, 'w', encoding='utf-8') as content_file:
			content_file.write(content)
예제 #3
0
class bilboAnnotate():
	"""Annotate the test data of every partition and extract its reference CRF data.

	The options object received by the constructor is shared with every
	``Bilbo`` / ``Corpus`` instance created here and is modified in place.
	"""

	def __init__(self, bilboOptions, dirCorpus, testPercentage, numberOfPartition=10, prefix=''):
		"""Store the options, preset them for annotation and build the partitions.

		Sets ``L``/``T`` to the annotate configuration, ``t`` to ``'bibl'``,
		``k`` to ``'all'`` and ``o`` to ``'simple'``; the remaining arguments
		are forwarded to ``Partition``.
		"""
		self.bilboOptions = bilboOptions
		self._setBilboAnnotate()
		self.bilboOptions.t = 'bibl'
		self.bilboOptions.k = 'all'
		self.bilboOptions.o = 'simple'

		layout = Partition(dirCorpus, testPercentage, numberOfPartition, prefix)
		self.partitions = layout
		self.dirPartitions = layout.getDirPartitionNames()

	def annotate(self):
		"""Process each partition: annotate first, then prepare evaluation data."""
		for partitionDir in self.dirPartitions:
			dirNames = self.partitions.getDirTestNames(partitionDir)
			annotateDir, testDir, trainDir, modelDir, resultDir = dirNames

			# Step 1: annotate the data in annotateDir with the model in
			# modelDir; stale tmp* folders in resultDir are cleared first.
			self._setBilboAnnotate()
			self._del_tmp_file(resultDir)
			engine = Bilbo(resultDir, self.bilboOptions, "crf_model_simple")
			engine.annotate(annotateDir, modelDir, 1)

			# Step 2: switch to the training configuration and extract CRF
			# training data from testDir for evaluation; this Bilbo instance
			# is created on trainDir, whose tmp* folders are cleared first.
			self._setBilboTrain()
			self._del_tmp_file(trainDir)
			engine = Bilbo(trainDir, self.bilboOptions, "crf_model_simple")
			referenceCorpus = Corpus(testDir, self.bilboOptions)
			referenceCorpus.extract(1, "bibl")
			engine.crf.prepareTrain(referenceCorpus, 1, "evaldata_CRF.txt", 1, 1)

	def _setBilboAnnotate(self):
		"""Put the shared options in the annotate configuration (L on, T off)."""
		options = self.bilboOptions
		options.L = True
		options.T = False

	def _setBilboTrain(self):
		"""Put the shared options in the train configuration (L off, T on)."""
		options = self.bilboOptions
		options.L = False
		options.T = True

	def _del_tmp_file(self, resultDir):
		"""Remove, recursively, each ``tmp*`` entry found directly in ``resultDir``."""
		for leftover in glob.glob(os.path.join(resultDir, 'tmp*')):
			shutil.rmtree(leftover)
예제 #4
0
파일: bilboEval.py 프로젝트: ansdma/bilbo
class bilboEval():
	"""Score the annotation of each partition and write an averaged summary.

	The annotated and the expected token streams are first reduced to
	``label<TAB>token`` lines and aligned with each other, because the two
	files are not tokenized the same way.
	"""

	def __init__(self, dirCorpus, testPercentage, numberOfPartition=10, prefix=''):
		"""Build the partition layout; all arguments are forwarded to ``Partition``."""
		self.partitions = Partition(dirCorpus, testPercentage, numberOfPartition, prefix)
		self.dirPartitions = self.partitions.getDirPartitionNames()

	def eval(self):
		"""Evaluate every partition and write per-partition and global reports.

		Per partition: read the annotated and the expected Wapiti files,
		format and align them, save both aligned versions in the result
		directory, score them with ``TokenAccuracyEval.evaluate`` and save
		the score text as ``evaluation.txt`` in the partition directory.
		Finally ``evaluation.tsv`` (labels header, one line of values per
		partition, column-wise average on the last line) is written to the
		percent directory.

		When there is no partition to evaluate nothing is written and the
		method simply returns.
		"""
		allValues = []
		labels = []  # bound up front so the summary never hits an unbound name
		for dirPartition in self.dirPartitions:
			(_annotateDir, _testDir, trainDir, _modelDir, resultDir) = self.partitions.getDirTestNames(dirPartition)

			testEstCRF = self._getFile(resultDir, 'testEstCRF_Wapiti.txt')
			testEstCRFFormated = self._formatEval(testEstCRF)

			desiredResult = self._getFile(trainDir, 'evaldata_CRF_Wapiti.txt')
			desiredResultFormated = self._formatEval(desiredResult)

			desiredResultFormated, testEstCRFFormated = self._harmonizeList(desiredResultFormated, testEstCRFFormated)

			testEstCRFFormated = "\n".join(testEstCRFFormated)
			desiredResultFormated = "\n".join(desiredResultFormated)
			self._saveFile(testEstCRFFormated, resultDir, 'annotatedEval.txt')
			self._saveFile(desiredResultFormated, resultDir, 'desiredEval.txt')

			evalText, labels, values = TokenAccuracyEval.evaluate(testEstCRFFormated, desiredResultFormated)
			allValues.append(values)
			self._saveFile(evalText, dirPartition, 'evaluation.txt')

		if not allValues:
			# No partition was evaluated: there is no header and no average to report.
			return

		# column-wise average over all partitions, reported as the last row
		average = [float(sum(col))/len(col) for col in zip(*allValues)]
		allValues.append(average)

		finalEval = "\t".join(labels) + "\n"
		finalEval += "\n".join(["\t".join(['{:f}'.format(v) for v in row]) for row in allValues])
		self._saveFile(finalEval, self.partitions.getDirPercentName(), 'evaluation.tsv')

	def _getFeatureAndName(self, token):
		"""Split a ``feature<TAB>name`` line into ``(feature, name)``.

		``name`` is the empty string when the line has no tab.  When the name
		holds several whitespace-separated pieces they are glued together.
		"""
		words = token.split("\t")
		feature = words[0]
		name = words[1] if len(words) > 1 else ""
		pieces = name.split()
		if len(pieces) > 1:
			name = "".join(pieces) # kind of a bug: get rid of non printing utf-8 characters
		return feature, name

	# output is not the same length, before debug, dirty solution to harmonise output
	def _harmonizeList(self, shortList, longList):
		"""Align two ``feature<TAB>name`` lists that were tokenized differently.

		Both lists are walked in parallel.  While the current name of one
		list is only a prefix of the current name of the other, the following
		entries of the lagging list are concatenated onto it (the feature of
		the first merged entry is kept).  Merging stops as soon as the names
		are equal, the lagging list has no entry left, or the names diverge
		so that no amount of merging could make them equal; in the last two
		cases the pair is emitted as it stands instead of indexing past the
		end of a list.

		Returns ``(newShortList, newLongList)``, always of equal length.
		Entries left over in one list once the other is exhausted are
		dropped.  A pair member whose name is empty is emitted as ``''``.
		"""
		lengthShort = len(shortList)
		lengthLong = len(longList)
		newShortList = []
		newLongList = []
		indexShort = 0
		indexLong = 0
		while indexShort < lengthShort and indexLong < lengthLong:
			featureShort, partShort = self._getFeatureAndName(shortList[indexShort])
			featureLong, partLong = self._getFeatureAndName(longList[indexLong])

			while partShort != partLong:
				if partLong.startswith(partShort) and indexShort + 1 < lengthShort:
					# short side lags behind: pull in its next entry
					indexShort += 1
					partShort += self._getFeatureAndName(shortList[indexShort])[1]
				elif partShort.startswith(partLong) and indexLong + 1 < lengthLong:
					# long side lags behind: pull in its next entry
					indexLong += 1
					partLong += self._getFeatureAndName(longList[indexLong])[1]
				else:
					# diverged, or nothing left to merge: keep the pair as is
					break

			newShortList.append(featureShort + "\t" + partShort if partShort else '')
			newLongList.append(featureLong + "\t" + partLong if partLong else '')

			indexShort += 1
			indexLong += 1

		return newShortList, newLongList

	def _getFile(self, fileDir, pattern):
		"""Return the UTF-8 text of the first ``fileDir/tmp*/pattern`` match.

		Raises IndexError when no file matches.
		"""
		pattern = os.path.join(fileDir,'tmp*', pattern)
		files = glob.glob(pattern)
		with open(files[0], 'r', encoding='utf-8') as content_file:
			return content_file.read()

	def _saveFile(self, content, dirName, fileName):
		"""Write ``content`` as UTF-8 to ``dirName/fileName``, replacing any existing file."""
		fileName = os.path.join(dirName, fileName)
		with open(fileName, 'w', encoding='utf-8') as content_file:
			content_file.write(content)

	def _formatEval(self, content):
		"""Turn space-separated lines into ``last<TAB>first`` entries.

		For every line of ``content`` holding at least two space-separated
		fields, the entry is the stripped last field, a tab, and the stripped
		first field; any other line yields ``''``.  One entry is produced per
		input line.
		"""
		formated = []
		for line in content.split("\n"):
			words = line.split(" ")
			if len(words) > 1:
				formated.append(words[-1].strip() + "\t" + words[0].strip())
			else:
				formated.append('')
		return formated