Пример #1
0
Файл: CRF.py Проект: Unaah/bilbo
	def __init__(self, dirResult, options=None):
		"""
		Store the result directory and options, and derive the project root path.

		Parameters
		----------
		dirResult : string
			directory for result files
		options : dict, optional
			options stored on the instance; a new empty dict is created when omitted

		Attributes
		----------
		generateXml : GenerateXml
		dirResult : string
			directory for result files
		options : dict
		dirModel : string
			model directory, initialised to the empty string
		rootDir : string
			real path of this file with its last four '/'-separated components removed
		"""
		self.generateXml = GenerateXml()
		self.dirResult = dirResult
		# A literal {} default would be a single dict shared by every instance
		# created without options, so a fresh one is built per instance instead.
		self.options = {} if options is None else options
		self.dirModel = ""
		main = os.path.realpath(__file__).split('/')
		self.rootDir = "/".join(main[:len(main)-4])
Пример #2
0
Файл: CRF.py Проект: Unaah/bilbo
class CRF(object):
	"""
	CRF object is created in a Bilbo object
	CRF model learning and test
	"""

	def __init__(self, dirResult, options={}):
		"""
		Attributes
		----------
		generateXml : GenerateXml
		dirResult : string
			directory for result files
		"""
		self.generateXml = GenerateXml()
		self.dirResult = dirResult
		self.options = options
		self.dirModel = ""
		main = os.path.realpath(__file__).split('/')
		self.rootDir = "/".join(main[:len(main)-4])		
		
	def setDirModel(self, dirModel):
		self.dirModel = dirModel
		
		
	def prepareTrain(self, corpus, typeCorpus, fileRes, tr=-1, extOption=-1, optsvm=True):
		"""
		Extract the references of one corpus type and write them to a data file
		in the result directory.

		Parameters
		----------
		corpus : Corpus
		typeCorpus : int, {1, 2, 3}
			1 : corpus 1, 2 : corpus 2...
		fileRes : string
			output file name, appended to dirResult
		tr : int, {1, 0, -1, -2} (default -1)
			tells the extractor whether the data is training or test data
			1 : train, 0 : test without label, -1 : test with label, -2 : test only label
		extOption : int, {-1, 1, ...} (default -1)
			extra option for the data format
			-1 : data format for SVM
			1 : data format for normal CRF training/test data
			2-5 : (not yet provided)
		optsvm : bool (default True)
			only consulted for corpus type 2 with extOption 1: when it equals True,
			the indices are first modified from the "svm_predictions_training" file
		"""
		references = ListReferences(corpus.getListReferences(typeCorpus), typeCorpus)
		extractor = Extract_crf(self.options)
		nbRef = corpus.nbReference(typeCorpus)

		# generation of training index for each reference
		extractor.randomgen(references, 1)

		# Corpus type 2 in CRF format uses a modified index list: references the SVM
		# classified as non-bibl are eliminated (or not printed). The comparison
		# with True is kept as an equality test on purpose.
		if typeCorpus == 2 and extOption == 1 and optsvm == True:
			extractor.extractIndices(self.dirResult + "svm_predictions_training", references)

		# Every other case (corpus 1, or source data for the SVM) is extracted with
		# the unmodified indices; the extraction call itself is the same.
		extractor.extract(typeCorpus, nbRef, self.dirResult + fileRes, references, tr, extOption)
	

	def prepareTest(self, corpus, typeCorpus, indiceSvm = 0):
		"""
		Extract the references of one corpus type into test data files in the
		result directory.

		Writes "data04SVM_ori.txt" when indiceSvm is -1; otherwise writes
		"testdatawithlabel_CRF.txt" and "testdata_CRF.txt".

		Parameters
		----------
		corpus : Corpus
		typeCorpus : int, {1, 2, 3}
			1 : corpus 1, 2 : corpus 2...
		indiceSvm : int, {0, -1, 2}
			0 : normal(corpus 1)
			-1 : data04SVM (corpus2),
			2 : external data => svm isn't called

		Returns
		-------
		ListReferences
			a new ListReferences built over the extracted references
		"""
		baseRefs = ListReferences(corpus.getListReferences(typeCorpus), typeCorpus)
		extractor = Extract_crf(self.options)
		nbRef = corpus.nbReference(typeCorpus)

		def freshRefs():
			# Each consumer receives its own ListReferences wrapper.
			return ListReferences(baseRefs.getReferences(), typeCorpus)

		# generation of test index for each reference
		extractor.randomgen(freshRefs(), 0)

		if indiceSvm == -1:
			extractor.extract(typeCorpus, nbRef, self.dirResult + "data04SVM_ori.txt", freshRefs())
			return freshRefs()

		# Corpus 2 takes its indices from the SVM predictions unless the data is external.
		if typeCorpus == 2 and indiceSvm != 2:
			extractor.extractIndices4new(self.dirResult + "svm_predictions_new", freshRefs())

		# Same extraction twice: first with tr = -1 (with label), then tr = 0 (without label).
		for fileName, tr in (("testdatawithlabel_CRF.txt", -1), ("testdata_CRF.txt", 0)):
			extractor.extract(typeCorpus, nbRef, self.dirResult + fileName, freshRefs(), tr, 1)

		return freshRefs()
		

	def runTrain(self, directory, fichier, modelname, penalty=0.00001) :
		"""
		Run CRF training module from Wapiti software
		
		Parameters
		----------
		directory : string 
			directory where we save the model
		fichier : string
			filename that has been generated by preprareTrain
		"""
		dependencyDir = os.path.join(self.rootDir, 'dependencies')
		command = dependencyDir+"/wapiti-1.4.0/wapiti train -p "+self.rootDir+"/KB/config/wapiti/pattern_ref -2 "+str(penalty)+" "+self.dirResult+fichier+" "+directory+modelname
		process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
		process.wait()	

		return
	

	def runTest(self, directory, fichier, modelname, addStr="") :
		"""
		Run CRF test module from Wapiti software to label new data
		
		Parameters
		----------
		directory : string 
			directory where we save the model
		fichier : string
			filename that has been generated by preprareTest
		"""
		dependencyDir = os.path.join(self.rootDir, 'dependencies')
		command = dependencyDir+"/wapiti-1.4.0/wapiti label -m "+directory+modelname+" "+self.dirResult+fichier+" "+self.dirResult+"testEstCRF"+addStr+"_Wapiti.txt"
		process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
		process.wait()
	
		'Create testEstCRF.txt keeping only predicted labels'
		fafter = open(self.dirResult+"testEstCRF"+addStr+".txt", 'w')
		for line in open(self.dirResult+"testEstCRF"+addStr+"_Wapiti.txt", 'r') :
			line = line.split()
			if len(line) > 0 : 
				fafter.write(str(line[len(line)-1]))
				fafter.write("\n")
			else : fafter.write("\n")
		fafter.close()
		if addStr == "" :
			self.generateXml.simpleComp(self.dirResult+"testdata_CRF.txt", self.dirResult+'testEstCRF.txt', 2, self.dirResult+'testEstCRF.xml')	
		return
	
	
	def postProcessTest(self, fnameCRFresult, fnameCRFtoAdd, refsAfterSVM):
		"""
		Post-processing of labeling result. After a normal CRF labeling, we return to the SVM classification result,
		then check which notes should be annotated as non-bibliographic ones, then actually modify the labeling result.
		
		Parameters
		----------
		fnameCRFresult : string 
			directory where we save the model
		fnameCRFtoAdd : string
			filename that has been generated by preprareTest
		refsAfterSVM : list of 'Reference' objects
			reference list containing SVM classification result
		"""
		
		fbefore = open(self.dirResult+fnameCRFresult, 'r')
		fafter = open(self.dirResult+fnameCRFtoAdd, 'w')
		
		for reference in refsAfterSVM :
			if reference.train != -1 :
				line = fbefore.readline()
				while (len(line.split()) > 0) :
					fafter.write(str(line))
					line = fbefore.readline()
				fafter.write("\n")
			elif len(reference.getWord()) > 0 : # if there is no word in the reference, it was already ignored in printing before
				line = fbefore.readline()
				while (len(line.split()) > 0) :
					fafter.write("nonbibl \n")
					line = fbefore.readline()
				fafter.write("\n")
		fafter.close()
		fbefore.close()
		
		self.generateXml.simpleComp(self.dirResult+"testdata_CRF.txt", self.dirResult+fnameCRFtoAdd, 2, self.dirResult+'testEstCRF.xml')	
		
		return