def testLoad(self):
    filename = 'C:\\Storage\\vft11ccas\\Source\\DataUCI\\pima\\pima-indians-diabetes.data'
    dataset = lsData.loadFromFile(filename)
    # The loaded dataset must have at least two items and at least one feature
    self.assertTrue(dataset.nItems >= 2)
    self.assertTrue(dataset.nFeatures > 0)
    # Split off one third of the items as the train sample (integer division)
    trainSampleSize = dataset.nItems // 3
    samples = dataset.split(trainSampleSize)
    self.assertEqual(samples.TrainSample.nItems, trainSampleSize)
    self.assertEqual(samples.TestSample.nItems, dataset.nItems - trainSampleSize)
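# NOTE: LinearSampling.dataset is not shown in this excerpt. The sketch below is only
# an assumption about the interface the test above exercises (loadFromFile, nItems,
# nFeatures, and split returning TrainSample/TestSample); it is not the library's
# actual implementation.
import numpy as np
from collections import namedtuple

Samples = namedtuple('Samples', ['TrainSample', 'TestSample'])

class Dataset(object):
    # Minimal stand-in for a LinearSampling dataset (hypothetical)
    def __init__(self, X, target):
        self.X = np.asarray(X)
        self.target = np.asarray(target)

    @property
    def nItems(self):
        # Number of rows (items) in the feature matrix
        return self.X.shape[0]

    @property
    def nFeatures(self):
        # Number of columns (features) in the feature matrix
        return self.X.shape[1]

    def split(self, trainSampleSize):
        # First trainSampleSize items form the train sample, the rest the test sample
        train = Dataset(self.X[:trainSampleSize], self.target[:trainSampleSize])
        test = Dataset(self.X[trainSampleSize:], self.target[trainSampleSize:])
        return Samples(TrainSample=train, TestSample=test)

def loadFromFile(filename):
    # Treat the last comma-separated column as the class label, the rest as features
    data = np.loadtxt(filename, delimiter=',')
    return Dataset(data[:, :-1], data[:, -1])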
__author__ = 'Alexander Frey'

# Import modules
import numpy as np
import statsmodels.api as sm
import LinearSampling.api as lsApi
import LinearSampling.dataset as lsData

np.random.mtrand.seed(10)

# Load dataset (Pima Indians Diabetes from the UCI repository; 768 items, 8 features)
dataset = lsData.loadFromFile('pima-indians-diabetes.data')
dataset.X = sm.add_constant(dataset.X, prepend=False)

# Split dataset into train and test samples of equal size (integer division)
samples = dataset.split(dataset.nItems // 2)
trainSample = samples.TrainSample
testSample = samples.TestSample

# Fit logistic regression on the train sample and print the model coefficients
logisticRegression = sm.GLM(trainSample.target, trainSample.X, family=sm.families.Binomial())
model = logisticRegression.fit().params
print "Model : [ " + ", ".join(format(x, ".3f") for x in model) + " ]"

# Check error rate on the train and test samples (threshold predictions at 0.5)
trainPredictions = logisticRegression.predict(model, trainSample.X)
trainErrorRate = float(sum((trainPredictions > 0.5) != trainSample.target)) / trainSample.nItems
testPredictions = logisticRegression.predict(model, testSample.X)
testErrorRate = float(sum((testPredictions > 0.5) != testSample.target)) / testSample.nItems
print "Train error rate : " + '%.3f' % (100 * trainErrorRate) + " %"
print "Test error rate  : " + '%.3f' % (100 * testErrorRate) + " %"
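# NOTE: purely an illustrative refactoring, not part of the original script. The
# thresholding logic used above for the error rates can be wrapped in a small helper;
# the function name errorRate and the default 0.5 threshold are assumptions.
def errorRate(predictedProbabilities, target, threshold=0.5):
    # Fraction of items whose thresholded prediction disagrees with the true label
    predictedLabels = np.asarray(predictedProbabilities) > threshold
    return float(np.sum(predictedLabels != np.asarray(target))) / len(target)

# Usage with the variables defined above:
#   trainErrorRate = errorRate(trainPredictions, trainSample.target)
#   testErrorRate = errorRate(testPredictions, testSample.target)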