示例#1
0
def main():
	"""Run the Yelp experiment and exit; the benchmark sweep below is currently dead code.

	As written, the function calls handleYelpDataset() and then exit(), so
	nothing after that call executes. The remaining code, if re-enabled,
	sweeps the module-level ``thresholdIG`` over ``numTries`` values
	(0, 0.05, 0.10, ...), rebuilds the data vectors with
	FeatureSelection.InformationGain for each value, times logistic
	regression and naive Bayes on them, and prints one table per classifier
	with the feature count, accuracy and runtime.
	"""
	# thresholdIG is assigned inside the sweep loop below.
	# NOTE(review): presumably buildDataVectors reads it -- confirm.
	global thresholdIG
	#builds the data vectors for both datasets..
	#liFeatures,trainVectors, testVectors = buildDataVectors(DataSet.Three, FeatureSelection.DocFrequency)
	#liFeatures,trainVectors, testVectors = buildDataVectors(DataSet.Two, FeatureSelection.InformationGain)	
	#print testVectors[0], len(testVectors)
	#logisticRegression.runLogisticRegression(liFeatures, trainVectors, testVectors)
	handleYelpDataset()
	# NOTE(review): exit() ends the run here, so the sweep below never
	# executes unless this call is removed.
	exit()
	
	numTries = 10
	# Both result dictionaries map a feature count to a tuple of
	# (runtime in seconds, accuracy).
	logResults = {} #dictionary to a tuple of (runtime in seconds, accuracy)
	naiveResults = {} 
	# Not used in the visible code; the random-forest step is still a TODO.
	randomResults = {}				
	# Thresholds to try: 0, 0.05, 0.10, ... built by repeated float addition.
	liThresholds=[]
	sum1 = 0
	for i in range(numTries):
		liThresholds.append(sum1)
		sum1 += 0.05 #remove 5% extra terms each time..
		#print sum1
		
	#lids = [DataSet.One, DataSet.Two]
	lids = [DataSet.Two]
	# NOTE(review): liLenFeatures and the result dictionaries are created once,
	# outside the dataset loop. With more than one dataset in lids, the tables
	# printed for a later dataset would also contain rows from earlier ones.
	liLenFeatures = []
	for ds in lids:
		if ds ==  DataSet.One:
			print "="*40
			print "Using data set one."
			print "="*40
		else:
			print "="*40
			print "Using data set two."			
			print "="*40						

		for k in range(numTries): #total 10 different number of features..			
			thresholdIG = liThresholds[k]		
			liFeatures,trainVectors, testVectors = buildDataVectors(ds, FeatureSelection.InformationGain)		
			liLenFeatures.append(len(liFeatures))
			lAccuracy = 0
			#perform logistic regression..
			# Results are keyed by feature count. NOTE(review): two thresholds
			# that yield the same number of features overwrite each other here,
			# while liLenFeatures keeps both entries, so that row prints twice.
			start_time = time.time()
			lAccuracy  = logisticRegression.runLogisticRegression(liFeatures, trainVectors, testVectors)
			logResults[len(liFeatures)] = (time.time() - start_time, lAccuracy)
		
		
			#perform naive Bayes...
			start_time = time.time()
			lAccuracy  = naiveBayes.runNaiveBayes(liFeatures, trainVectors, testVectors)			
			naiveResults[len(liFeatures)] = (time.time() - start_time, lAccuracy)
		
			#perform random forest...
			#TODO
	
		# Report rows in ascending order of feature count.
		liLenFeatures.sort()
			
		print "-"*40
		print "Printing results with Logistic Regression"
		print "-"*40
		print "(numFeatures)\t(accuracy)\t(runTime)"
		for k in liLenFeatures:
			# Stored tuples are (runtime, accuracy); accuracy is printed first.
			print k, "\t", logResults[k][1], "\t", logResults[k][0]

		print "-"*40
		print "Printing results with Naive Bayes"
		print "-"*40
		print "(numFeatures)\t(accuracy)\t(runTime)"
		for k in liLenFeatures:
			print k, "\t", naiveResults[k][1], "\t", naiveResults[k][0]

		''' 
示例#2
0
def main():
    """Run the Yelp experiment and exit; the benchmark sweep below is currently dead code.

    As written, the function calls handleYelpDataset() and then exit(), so
    nothing after that call executes. The remaining code, if re-enabled,
    sweeps the module-level ``thresholdIG`` over ``numTries`` values
    (0, 0.05, 0.10, ...), rebuilds the data vectors with
    FeatureSelection.InformationGain for each value, times logistic
    regression and naive Bayes on them, and prints one table per classifier
    with the feature count, accuracy and runtime.
    """
    # thresholdIG is assigned inside the sweep loop below.
    # NOTE(review): presumably buildDataVectors reads it -- confirm.
    global thresholdIG
    #builds the data vectors for both datasets..
    #liFeatures,trainVectors, testVectors = buildDataVectors(DataSet.Three, FeatureSelection.DocFrequency)
    #liFeatures,trainVectors, testVectors = buildDataVectors(DataSet.Two, FeatureSelection.InformationGain)
    #print testVectors[0], len(testVectors)
    #logisticRegression.runLogisticRegression(liFeatures, trainVectors, testVectors)
    handleYelpDataset()
    # NOTE(review): exit() ends the run here, so the sweep below never
    # executes unless this call is removed.
    exit()

    numTries = 10
    # Both result dictionaries map a feature count to a tuple of
    # (runtime in seconds, accuracy).
    logResults = {}  #dictionary to a tuple of (runtime in seconds, accuracy)
    naiveResults = {}
    # Not used in the visible code; the random-forest step is still a TODO.
    randomResults = {}
    # Thresholds to try: 0, 0.05, 0.10, ... built by repeated float addition.
    liThresholds = []
    sum1 = 0
    for i in range(numTries):
        liThresholds.append(sum1)
        sum1 += 0.05  #remove 5% extra terms each time..
        #print sum1

    #lids = [DataSet.One, DataSet.Two]
    lids = [DataSet.Two]
    # NOTE(review): liLenFeatures and the result dictionaries are created once,
    # outside the dataset loop. With more than one dataset in lids, the tables
    # printed for a later dataset would also contain rows from earlier ones.
    liLenFeatures = []
    for ds in lids:
        if ds == DataSet.One:
            print "=" * 40
            print "Using data set one."
            print "=" * 40
        else:
            print "=" * 40
            print "Using data set two."
            print "=" * 40

        for k in range(numTries):  #total 10 different number of features..
            thresholdIG = liThresholds[k]
            liFeatures, trainVectors, testVectors = buildDataVectors(
                ds, FeatureSelection.InformationGain)
            liLenFeatures.append(len(liFeatures))
            lAccuracy = 0
            #perform logistic regression..
            # Results are keyed by feature count. NOTE(review): two thresholds
            # that yield the same number of features overwrite each other here,
            # while liLenFeatures keeps both entries, so that row prints twice.
            start_time = time.time()
            lAccuracy = logisticRegression.runLogisticRegression(
                liFeatures, trainVectors, testVectors)
            logResults[len(liFeatures)] = (time.time() - start_time, lAccuracy)

            #perform naive Bayes...
            start_time = time.time()
            lAccuracy = naiveBayes.runNaiveBayes(liFeatures, trainVectors,
                                                 testVectors)
            naiveResults[len(liFeatures)] = (time.time() - start_time,
                                             lAccuracy)

            #perform random forest...
            #TODO

        # Report rows in ascending order of feature count.
        liLenFeatures.sort()

        print "-" * 40
        print "Printing results with Logistic Regression"
        print "-" * 40
        print "(numFeatures)\t(accuracy)\t(runTime)"
        for k in liLenFeatures:
            # Stored tuples are (runtime, accuracy); accuracy is printed first.
            print k, "\t", logResults[k][1], "\t", logResults[k][0]

        print "-" * 40
        print "Printing results with Naive Bayes"
        print "-" * 40
        print "(numFeatures)\t(accuracy)\t(runTime)"
        for k in liLenFeatures:
            print k, "\t", naiveResults[k][1], "\t", naiveResults[k][0]
        ''' 
示例#3
0
def handleYelpDataset():
	liFeatures,trainVectors, testVectors = buildDataVectors(DataSet.Yelp, FeatureSelection.InformationGain)
	print "running log reg now.."
	logisticRegression.runLogisticRegression(liFeatures, trainVectors, testVectors)
示例#4
0
def handleYelpDataset():
    liFeatures, trainVectors, testVectors = buildDataVectors(
        DataSet.Yelp, FeatureSelection.InformationGain)
    print "running log reg now.."
    logisticRegression.runLogisticRegression(liFeatures, trainVectors,
                                             testVectors)