Пример #1
0
	def measureFrequencies(self):
		"""Rebuild this instance's word-frequency table from its message.

		A fresh ``frecuencyTable`` is stored in ``self.tableOfFrequencies``
		and every token obtained by splitting ``self.triple['message']`` on
		single spaces is added to it, one ``add`` call per occurrence.
		"""
		table=frecuencyTable()
		self.tableOfFrequencies=table
		# splitting on " " (not on arbitrary whitespace) keeps empty tokens
		# produced by consecutive spaces
		for token in self.triple['message'].split(" "):
			table.add(token)
Пример #2
0
def _hasLabel(instance):
	"""Return True when ``instance.triple['label']`` is neither None nor ''."""
	label=instance.triple['label']
	return label is not None and label!=''


def _splitSeedsAndTest(listOfData,setOfLabels,numberOfSeedsPerLabel):
	"""Split the labelled instances into seed (training) and test positions.

	Instances are visited in order. A labelled instance becomes a seed while
	its label still has fewer than ``numberOfSeedsPerLabel`` seeds and its
	message text has not already been used as a seed; every other labelled
	instance goes to the test side. Unlabelled instances are skipped.

	Returns:
		A pair ``(seedIndices, testIndices)`` of positions in ``listOfData``.
	"""
	seedsPerLabel={}
	for label in setOfLabels:
		seedsPerLabel[label]=0
	seedMessages=set()
	seedIndices=[]
	testIndices=[]
	for index,instance in enumerate(listOfData):
		if not _hasLabel(instance):
			continue
		label=instance.triple['label']
		message=instance.triple['message']
		if seedsPerLabel[label]<numberOfSeedsPerLabel and message not in seedMessages:
			seedsPerLabel[label]+=1
			seedMessages.add(message)
			seedIndices.append(index)
		else:
			testIndices.append(index)
	return seedIndices,testIndices


def _formatLabelLines(instances):
	"""Build the ``id<TAB>label<TAB>1.0`` lines used by the seed/gold files."""
	return "".join(
		str(instance.triple['id'])+"\t"+instance.triple['label']+"\t"+"1.0\n"
		for instance in instances
	)


def _printEvaluation(listOfClassifiers,setOfLabels,testListOfdata,testMatrix):
	"""Run every per-label classifier on every test row and print the scores.

	A prediction counts as right when a classifier answers 1.0 for the
	instance's real label, or answers anything else for a different label.
	Precision, recall and f-score are printed for each label that occurs in
	the test data.
	"""
	numberOfSamplesPerLabel={}
	#per label: [correct positive predictions, positive predictions]
	dictOfPresicion={}
	for label in setOfLabels:
		numberOfSamplesPerLabel[label]=0
		dictOfPresicion[label]=[0,0]

	countOfRightClassifications=0
	countOfPredictions=0
	for testInstance,testVector in zip(testListOfdata,testMatrix):
		realLabel=testInstance.triple['label']
		numberOfSamplesPerLabel[realLabel]+=1
		for label in setOfLabels:
			prediction=listOfClassifiers[label].predict(testVector)[0]
			print("predicttion of:: "+label+":"+str(prediction)+"__real:"+realLabel)
			countOfPredictions+=1
			#this line is logged whatever the prediction was
			print("predicted:: "+label+"__real:"+realLabel)
			if prediction==1.0:
				dictOfPresicion[label][1]+=1
				if label==realLabel:
					dictOfPresicion[label][0]+=1
					countOfRightClassifications+=1
			elif label!=realLabel:
				countOfRightClassifications+=1

	print("len of testdata:"+str(len(testListOfdata)))
	print("right class:"+str(countOfRightClassifications))
	print("number of predctions:"+str(countOfPredictions))
	if countOfPredictions>0:
		print("accuracy: "+str(countOfRightClassifications/(countOfPredictions*1.0)))
	else:
		#no test instances: report it instead of dividing by zero
		print("accuracy: undefined (no predictions were made)")

	print("-----------------------")
	for label in setOfLabels:
		if numberOfSamplesPerLabel[label]<=0:
			continue
		print("***"+label+"***")
		rightPositives,positives=dictOfPresicion[label]
		presition=0
		if positives>0:
			presition=rightPositives/(positives*1.0)
			print("presition:"+str(presition))
		else:
			print("presition: none instance was classified done")
		recall=rightPositives/(numberOfSamplesPerLabel[label]*1.0)
		print("recall:"+str(recall))
		if presition+recall>0:
			print("fscore: "+str((2.0*presition*recall)/(presition+recall)))
		print("---")


def trainSupervisedSVM(pathOfDataFile,percentageOfSamples,pmiLowerBound):
	"""Train per-label classifiers on a seed subset and print their scores.

	Steps, in order:
	  1. read the triples with ``readCSV`` and wrap each one in ``Instance``;
	  2. clean every message, collect the vocabulary, the label set and the
	     global word frequencies;
	  3. pick the seed (training) instances, the same number for every label;
	  4. select feature words: those whose PMI (computed on the seeds by
	     ``getSetOfWordsPerLabel``) exceeds ``pmiLowerBound``, plus the first
	     40% of the reversed ``sort_by_value()`` list;
	  5. build one frequency vector per instance over the selected words and
	     pass the vectors through ``tfidfTransform``;
	  6. train with ``trainSVMPredictoForLabels`` on the seed rows and
	     evaluate on the remaining labelled rows.

	Side effects: prints progress and results, and writes the files
	``fpgrowthdata``, ``seeds`` and ``gold_labels`` in the current directory.

	Args:
		pathOfDataFile: path handed to ``readCSV``.
		percentageOfSamples: factor multiplied by the number of instances to
			obtain the total seed budget, which is then divided evenly among
			the labels (rounded down).
		pmiLowerBound: a word is selected only if its PMI is strictly greater
			than this value.

	Raises:
		ZeroDivisionError: when no instance carries a label.
	"""
	#read the file and convert the triples into objects
	listOfData=[Instance(triple) for triple in readCSV(pathOfDataFile)]

	#frequency table of all the words in all the messages
	frecuencies=frecuencyTable()
	#vocabulary of the documents
	setOfWords=Set()
	#labels found in the instances
	setOfLabels=Set()
	for instance in listOfData:
		print("cleaning: "+str(instance.triple['id']))
		instance.cleanMessage()
		instance.measureFrequencies()

		if _hasLabel(instance):
			setOfLabels.add(instance.triple['label'])
		print(setOfLabels)

		for vocabularyWord in instance.getFrecuencyTable().getKeys():
			setOfWords.add(vocabularyWord)
		for word in instance.triple['message'].split(" "):
			frecuencies.add(word)

	listOfWordsByValue=frecuencies.sort_by_value()
	print("words by frequencie---")
	for wordd in listOfWordsByValue:
		print(wordd)
	print("--------------------------")

	print("looking for PMI")
	#there should be an equal number of seeds for each label
	numberOfSeeds=len(listOfData)*percentageOfSamples
	numberOfSeedsPerLabel=math.floor(numberOfSeeds/(1.0*len(setOfLabels)))
	#the split is computed once and reused for PMI, the seed file and training
	seedIndices,testIndices=_splitSeedsAndTest(listOfData,setOfLabels,numberOfSeedsPerLabel)
	trainingListOfdata=[listOfData[index] for index in seedIndices]
	testListOfdata=[listOfData[index] for index in testIndices]

	listOfPMI=getSetOfWordsPerLabel(setOfLabels,setOfWords,trainingListOfdata,"PMI")

	#the words whose PMI is over the threshold
	setOfSelectedWords=Set()
	for keyQueue in listOfPMI.keys():
		queue=listOfPMI[keyQueue]
		#get() consumes the queue, so one pass over each queue is enough
		while not queue.empty():
			pmi=queue.get()[1]
			if pmi['pmi']>pmiLowerBound:
				setOfSelectedWords.add(pmi['word'])

	#also keep the first 40% of the reversed by-value word list
	#NOTE(review): presumably these are the most frequent words - confirm the
	#order returned by sort_by_value()
	totalNumberOfDiffWords=int(math.ceil(len(listOfWordsByValue)*0.4))
	listOfWordsByValue.reverse()
	for wordd in listOfWordsByValue[:totalNumberOfDiffWords]:
		#log only the entry being selected, not the whole list every time
		print(wordd)
		#index 1 of each entry is used as the word
		setOfSelectedWords.add(wordd[1])

	print("training classifiers")

	#creates the vector for each instance and the content for fpgrowth
	print("creating vectors for each message")
	instanceVectors=[]
	fpGrowthLines=[]
	for instance in listOfData:
		table=instance.getFrecuencyTable()
		wordsInMessage=[]
		#only the selected words are used as dimensions
		for word in setOfSelectedWords:
			frequency=table.get(word)
			instance.vector.append(frequency*1.0)
			if frequency>0:
				wordsInMessage.append(" "+word)
		fpGrowthLines.append("".join(wordsInMessage)+"\n")
		instanceVectors.append(instance.vector)

	#written once, after every line is known, and closed deterministically
	with open('fpgrowthdata','w') as fpGrowthFile:
		fpGrowthFile.write("".join(fpGrowthLines))

	print("calculating tf-idf")
	matrixLSA=tfidfTransform(instanceVectors)
	#no dimensionality reduction is applied: the tf-idf rows are used as they are
	print("calculatin svd")

	print("calculating the graph files for Junto")

	#rows of the matrix follow the order of listOfData
	trainingMatrix=[matrixLSA[index] for index in seedIndices]
	testMatrix=[matrixLSA[index] for index in testIndices]

	#seed file: the instances whose label is given to the algorithm
	with open("seeds",'w') as seedFile:
		seedFile.write(_formatLabelLines(trainingListOfdata))
	#gold file: the labelled instances against which performance is measured
	with open("gold_labels",'w') as goldFile:
		goldFile.write(_formatLabelLines(testListOfdata))

	#train an svm classifier per label with the seed samples
	print("len of training data:"+str(len(trainingListOfdata)))
	listOfClassifiers=trainSVMPredictoForLabels(trainingListOfdata,setOfLabels,trainingMatrix)
	_printEvaluation(listOfClassifiers,setOfLabels,testListOfdata,testMatrix)