def measureFrequencies(self): #method of the Instance class
    #creates a new table of frequencies
    self.tableOfFrequencies=frecuencyTable()
    #get the words of the message
    words=self.triple['message'].split(" ")
    #count how many times each word occurs
    for word in words:
        self.tableOfFrequencies.add(word)
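#A minimal sketch of the frecuencyTable helper used throughout this file
#(hypothetical: the real class is defined elsewhere; its interface --
#add, get, getKeys and sort_by_value -- is inferred from how it is
#called below).
class frecuencyTable:
    def __init__(self):
        #maps each word to the number of times it has been added
        self.counts={}
    def add(self,word):
        #increment the count of word
        self.counts[word]=self.counts.get(word,0)+1
    def get(self,word):
        #count of word, 0 if it was never added
        return self.counts.get(word,0)
    def getKeys(self):
        #the vocabulary observed so far
        return self.counts.keys()
    def sort_by_value(self):
        #(count,word) pairs sorted by count in ascending order, so that
        #reversing the result yields the most frequent words first
        return sorted([(count,word) for word,count in self.counts.items()])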
import math #needed for floor/ceil below

def trainSupervisedSVM(pathOfDataFile,percentageOfSamples,pmiLowerBound):
    #table of frequencies of all the words in the messages
    frequencies=frecuencyTable()
    #read the file with messages (e.g. Data/terraReducedTest.csv)
    #and convert the triples into objects
    listOfTriples=readCSV(pathOfDataFile)
    listOfData=[]
    for triple in listOfTriples:
        listOfData.append(Instance(triple))
    #stores the vocabulary of the docs
    setOfWords=set()
    #stores the set of labels found in the instances
    setOfLabels=set()
    for instance in listOfData:
        print "cleaning: "+str(instance.triple['id'])
        #clean the message and gather its word frequencies
        instance.cleanMessage()
        instance.measureFrequencies()
        #stores the labels
        if(instance.triple['label']!=None and instance.triple['label']!=''):
            setOfLabels.add(instance.triple['label'])
        print setOfLabels
        #add each word to setOfWords (the vocabulary)
        currentVocabulary=instance.getFrecuencyTable().getKeys()
        for v in currentVocabulary:
            setOfWords.add(v)
        #gather the frequencies of each message into the global table
        for word in instance.triple['message'].split(" "):
            frequencies.add(word)
    listOfWordsByValue=frequencies.sort_by_value()
    print "words by frequency---"
    for wordd in listOfWordsByValue:
        print wordd
    print "--------------------------"
    print "looking for PMI"
    #split the instances into annotated and unannotated
    listOfAnnotatedData=[]
    listOfUnannotatedData=[]
    for instance in listOfData:
        if instance.triple['label']!="":
            listOfAnnotatedData.append(instance)
        else:
            listOfUnannotatedData.append(instance)
    currentNumberOfSeedsPerLabel={}
    for key in setOfLabels:
        currentNumberOfSeedsPerLabel[key]=0
    #there should be an equal number of seeds for each label
    percentage=percentageOfSamples
    numberOfSeeds=len(listOfData)*percentage
    currentNumberOfSeeds=0
    numberOfSeedsPerLabel=math.floor(numberOfSeeds/(1.0*len(setOfLabels)))
    numberOfSeeds=numberOfSeedsPerLabel*len(setOfLabels)
    listForAuxiliaryTraining=[]
    listOfTrainingData_=[]
    SetOfSeeds2=set()
    for instance in listOfData:
        if((not instance.triple['label']=='') and (not instance.triple['label']==None)):
            #while the quota of seeds for this label is not filled, the
            #instance is a seed; otherwise it goes to the auxiliary set
            if(currentNumberOfSeedsPerLabel[instance.triple['label']]<numberOfSeedsPerLabel and not instance.triple['message'] in SetOfSeeds2):
                currentNumberOfSeedsPerLabel[instance.triple['label']]=currentNumberOfSeedsPerLabel[instance.triple['label']]+1
                SetOfSeeds2.add(instance.triple['message'])
                listOfTrainingData_.append(instance)
            else:
                listForAuxiliaryTraining.append(instance)
    listOfPMI=getSetOfWordsPerLabel(setOfLabels,setOfWords,listOfTrainingData_,"PMI")
    #the words whose PMI is over the threshold
    setOfSelectedWords=set()
    #maximum number of dimensions (effectively unbounded)
    numberOfDimensions=float('inf')
    for Keyqueue in listOfPMI.keys():
        queue=listOfPMI[Keyqueue]
        currentCount=0
        while not queue.empty() and currentCount<numberOfDimensions:
            pmi=queue.get()[1]
            if(pmi['pmi']>pmiLowerBound):
                #print pmi['word']+"--"+str(pmi['pmi'])+"--"+pmi['label']
                currentCount=currentCount+1
                setOfSelectedWords.add(pmi['word'])
    #also select the 40% most frequent words
    totalNumberOfDiffWords=int(math.ceil(len(listOfWordsByValue)*0.4))
    listOfWordsByValue.reverse()
    counter=0
    for wordd in listOfWordsByValue:
        if(counter==totalNumberOfDiffWords):
            break
        print wordd
        counter=counter+1
        setOfSelectedWords.add(wordd[1])
    #train a set of classifiers for words
    print "training classifiers"
    #setOfClassifiers=trainPredictors(listOfData,setOfSelectedWords,setOfWords)
    #creates the content of a file for fpgrowth
    contentFileForFPGrowth=""
    #creates the vector for each instance
    print "creating vectors for each message"
    instanceVectors=[]
    for instance in listOfData:
        #for word in setOfWords: #when generating vectors with all the words in the vocabulary
        for word in setOfSelectedWords: #when generating vectors with just the words above the PMI threshold
            #using linear classifiers (disabled):
            #if(instance.getFrecuencyTable().get(word)*1.0>100.0):
            #    instance.vector.append(instance.getFrecuencyTable().get(word)*1.0)
            #else:
            #    vocabulary_temp=deepcopy(setOfWords)
            #    if(word in setOfWords):
            #        vocabulary_temp.remove(word)
            #    vectorRepresentation=instance.getVectorRepresentation(vocabulary_temp)
            #    label=setOfClassifiers[word].predict(vectorRepresentation)
            #    if(label[0]>0.0):
            #        print "calculated label: "+str(label)
            #        instance.vector.append(label[0])
            #/using linear classifiers
            instance.vector.append(instance.getFrecuencyTable().get(word)*1.0) #if prediction does not matter
            if(instance.getFrecuencyTable().get(word)>0):
                contentFileForFPGrowth=contentFileForFPGrowth+" "+word
        contentFileForFPGrowth=contentFileForFPGrowth+"\n"
        instanceVectors.append(instance.vector)
    FPgrowthFile=open('fpgrowthdata','w')
    FPgrowthFile.write(contentFileForFPGrowth)
    FPgrowthFile.close()
    #SVD
    #matrix=np.matrix(instanceVectors)
    print "calculating tf-idf"
    matrix=tfidfTransform(instanceVectors)
    print "calculating SVD"
    matrixLSA=matrix
    #matrixLSA=svdDimensionalityReduction(matrix,1)
    #print matrixLSA
    print "calculating the graph files for Junto"
    #creates a Junto graph
    #createJuntoGraph('input_graph',instanceVectors,matrixLSA)
    #trains a classifier for a label on all the data
    #trainSVMPredictoForLabels(listOfData,setOfLabels,matrixLSA)
    #this defines the number of seeds (annotated data for the algorithm)
    currentNumberOfSeedsPerLabel={}
    for key in setOfLabels:
        currentNumberOfSeedsPerLabel[key]=0
    #there should be an equal number of seeds for each label
    percentage=percentageOfSamples
    numberOfSeeds=len(instanceVectors)*percentage
    currentNumberOfSeeds=0
    numberOfSeedsPerLabel=math.floor(numberOfSeeds/(1.0*len(setOfLabels)))
    numberOfSeeds=numberOfSeedsPerLabel*len(setOfLabels)
    #creates the gold_labels file for Junto (the instances whose label is known)
    #the seed file holds the instances whose label is already given
    seedFileContent=""
    seedFile=open("seeds",'w')
    #training set of instances
    trainingListOfdata=[]
    #training set of vectors
    trainingMatrix=[]
    #test data
    testListOfdata=[]
    testMatrix=[]
    #the gold file holds the gold standard against which performance is measured
    goldFileContent=""
    goldFile=open("gold_labels",'w')
    counter_=0
    SetOfSeeds=set()
    for instance in listOfData:
        if((not instance.triple['label']=='') and (not instance.triple['label']==None)):
            #while the quota of seeds for this label is not filled, the
            #instance is a seed; otherwise it is test
            if(currentNumberOfSeedsPerLabel[instance.triple['label']]<numberOfSeedsPerLabel and not instance.triple['message'] in SetOfSeeds):
                seedFileContent=seedFileContent+str(instance.triple['id'])+"\t"+instance.triple['label']+"\t"+"1.0\n"
                currentNumberOfSeedsPerLabel[instance.triple['label']]=currentNumberOfSeedsPerLabel[instance.triple['label']]+1
                trainingListOfdata.append(instance)
                trainingMatrix.append(matrixLSA[counter_])
                SetOfSeeds.add(instance.triple['message'])
            else:
                goldFileContent=goldFileContent+str(instance.triple['id'])+"\t"+instance.triple['label']+"\t"+"1.0\n"
                testListOfdata.append(instance)
                testMatrix.append(matrixLSA[counter_])
        counter_=counter_+1
    seedFile.write(seedFileContent)
    seedFile.close()
    goldFile.write(goldFileContent)
    goldFile.close()
    #train an SVM classifier for the given samples
    print "len of training data:"+str(len(trainingListOfdata))
    #per label: [number of correct positive predictions, number of positive predictions]
    numberOfSamplesPerLabel={}
    dictOfPrecision={}
    for label in setOfLabels:
        numberOfSamplesPerLabel[label]=0
        dictOfPrecision[label]=[0,0]
    listOfClassifiers=trainSVMPredictoForLabels(trainingListOfdata,setOfLabels,trainingMatrix)
    countOfRightClassifications=0
    countOfPredictions=0
    notClassified=0
    for i in range(0,len(testListOfdata)):
        numberOfSamplesPerLabel[testListOfdata[i].triple['label']]=numberOfSamplesPerLabel[testListOfdata[i].triple['label']]+1
        for label in setOfLabels:
            prediction=listOfClassifiers[label].predict(testMatrix[i])[0]
            print "prediction of:: "+label+":"+str(prediction)+"__real:"+testListOfdata[i].triple['label']
            countOfPredictions=countOfPredictions+1
            if(prediction==1.0):
                dictOfPrecision[label][1]=dictOfPrecision[label][1]+1
                print "predicted:: "+label+"__real:"+testListOfdata[i].triple['label']
                if(label==testListOfdata[i].triple['label']):
                    dictOfPrecision[label][0]=dictOfPrecision[label][0]+1
                    countOfRightClassifications=countOfRightClassifications+1
            else:
                print "predicted:: "+label+"__real:"+testListOfdata[i].triple['label']
                if(label!=testListOfdata[i].triple['label']):
                    countOfRightClassifications=countOfRightClassifications+1
    print "len of testdata:"+str(len(testListOfdata))
    print "right class:"+str(countOfRightClassifications)
    print "number of predictions:"+str(countOfPredictions)
    print "accuracy: "+str(countOfRightClassifications/(countOfPredictions*1.0))
    print "-----------------------"
    for label in setOfLabels:
        if(numberOfSamplesPerLabel[label]>0):
            print "***"+label+"***"
            precision=0
            recall=0
            if(dictOfPrecision[label][1]>0):
                precision=dictOfPrecision[label][0]/(dictOfPrecision[label][1]*1.0)
                print "precision:"+str(precision)
            else:
                print "precision: no instance was classified"
            recall=dictOfPrecision[label][0]/(numberOfSamplesPerLabel[label]*1.0)
            print "recall:"+str(recall)
            if(precision+recall>0):
                print "fscore: "+str((2.0*precision*recall)/(precision+recall))
            print "---"
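#A minimal usage sketch (hypothetical: the parameter values are only
#illustrative, and readCSV, Instance, getSetOfWordsPerLabel,
#tfidfTransform and trainSVMPredictoForLabels are assumed to be defined
#elsewhere in this module). Data/terraReducedTest.csv is the sample path
#mentioned in the comments above; here 30% of the messages are used as
#seeds and a word needs a PMI above 2.0 to be kept as a dimension.
if __name__=='__main__':
    trainSupervisedSVM('Data/terraReducedTest.csv',0.3,2.0)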