def _paramSuffix(model_params):
    """Return the '_<param>_<param>....txt' tail shared by both result files."""
    return ''.join('_' + str(param) for param in model_params) + '.txt'


def runOnSplit(penalties, constants, split):
    """Fit and evaluate one logistic-regression model per (penalty, C) pair.

    The data comes from ``FeatureExtractor(split).featurizeFiles('../data')``,
    whose result is unpacked as: class names, train matrix, train labels,
    dev matrix, dev labels, train files, dev files.

    For every combination of ``penalties`` and ``constants`` the function:
      * fits ``LogisticRegression().scikit(penalty, C)`` on the train part,
      * prints the per-class miss rate and the overall score on the dev part
        and writes them to ``results/scores/LRsplit_<penalty>_<C>.txt``,
      * writes the dev examples, sorted by the predicted probability of the
        first class, to
        ``results/rankedExamples/LRsplit_<split*100>_<penalty>_<C>.txt``.

    Both output directories must already exist; ``open`` does not create
    them.

    Args:
        penalties: iterable of penalty settings handed to ``scikit``.
        constants: iterable of regularization constants handed to ``scikit``.
        split: value passed to ``FeatureExtractor`` and used in the banner
            and the ranked-examples file name (presumably the training
            fraction in [0, 1] -- confirm against FeatureExtractor).

    Returns:
        None.
    """
    # The banner used to be built and thrown away (the print was missing).
    # Single-argument print(...) behaves the same as the print statement.
    print("Running on a " + str(split*100) + '/' + str((1-split)*100) + ' split')

    fe = FeatureExtractor(split)
    featurized = fe.featurizeFiles('../data')
    classNames = featurized[0]
    trainMatrix, trainLabels = featurized[1:3]
    devMatrix, devLabels = featurized[3:5]
    # The train file list is not needed here; unpacking two values still
    # checks that exactly two trailing items were returned.
    _trainFiles, devFiles = featurized[5:]

    # Number of dev examples per true label.
    classCounts = Counter(devLabels)

    for penalty in penalties:
        for C in constants:
            print("\nPenalty, regularization:  " + str(penalty) + " " + str(C))
            abstractModel = LogisticRegression()
            model = abstractModel.scikit(penalty, C)
            model_params = (penalty, C)
            model.fit(trainMatrix, trainLabels)

            score = model.score(devMatrix, devLabels)
            predicted_labels = model.predict(devMatrix)
            probs = model.predict_proba(devMatrix)

            # Misclassified dev examples, counted under their true label.
            errors = Counter()
            for pred, actual in zip(predicted_labels, devLabels):
                if pred != actual:
                    errors[actual] += 1

            # (class probabilities, file, prediction-was-correct) per example.
            rankedExamples = [
                (p, devFiles[i], predicted_labels[i] == devLabels[i])
                for i, p in enumerate(probs)
            ]

            lines = []
            for i, c in enumerate(classNames):
                # The position in classNames is used as the label value.
                if classCounts[i]:
                    missRate = str(float(errors[i]) / classCounts[i])
                else:
                    # No dev example of this class: report that instead of
                    # dying with ZeroDivisionError.
                    missRate = 'n/a'
                lines.append('\t' + c + ' error: ' + missRate + '\n')
            results = ''.join(lines) + '\tScore: ' + str(score)

            # NOTE(review): unlike the ranked-examples file below, this name
            # does not contain the split, so runs with different splits
            # overwrite each other. Left as is in case other tooling reads
            # this exact path.
            fileName = 'results/scores/LRsplit' + _paramSuffix(model_params)
            with open(fileName, 'w') as f:
                f.write(results)
            print(results)

            print('..ranking examples')
            if rankedExamples:
                # Ascending by the probability of the first class.
                examples = sorted(rankedExamples, key=lambda e: e[0][0])
                fileName = ('results/rankedExamples/LRsplit_' + str(split*100)
                            + _paramSuffix(model_params))
                with open(fileName, 'w') as f:
                    for classProbs, exampleFile, correct in examples:
                        entry = exampleFile
                        entry += '\n\t Probability of class '
                        entry += classNames[0] + ': '
                        entry += str(classProbs[0])
                        entry += '\n\t Correct: ' + str(correct)
                        # Terminate the entry so the next file name starts on
                        # its own line instead of being glued to "Correct: ...".
                        entry += '\n'
                        f.write(entry)
# k-nearest-neighbours experiment: grid-search the number of neighbours on the
# featurized files under ../data and print what the search found.
from sklearn.neighbors import KNeighborsClassifier
from sklearn import grid_search
from feature_extractor import FeatureExtractor

# Only the first three items returned by featurizeFiles are used here.
# (An earlier version loaded the matrix and labels through
# VF.extractWordCounts(True, True, False) instead.)
fe = FeatureExtractor(1)
featurized = fe.featurizeFiles('../data')
classNames, repubAndDemMatrix, labels = featurized[:3]

# Search space: n_neighbors = 1..10.
# Alternatives that were tried or considered and are currently switched off:
#   n_neighbors up to 30
#   'weights': ('uniform', 'distance')
#   'p': [1, 2, 3, 4, 5]
#   'metric': euclidean, manhattan, chebyshev, minkowski, jaccard, matching,
#             dice, kulsinski, rogerstanimoto, russellrao, sokalmichener,
#             sokalsneath
parameters = {'n_neighbors': list(range(1, 11))}

kn = KNeighborsClassifier()
clf = grid_search.GridSearchCV(kn, parameters)
clf.fit(repubAndDemMatrix, labels)

# Full description of the winning estimator (verbose).
print(clf.best_estimator_)
# Just the winning parameter values (the more useful summary).
print(clf.best_params_)
# Cross-validation score of the winning parameters.
print(clf.best_score_)
# Score of the search's best estimator on the same data it was fit on.
print(clf.score(repubAndDemMatrix, labels))

# Recorded results. The original notes call these numbers "errors"; for a
# scikit-learn classifier the score is mean accuracy, so higher is better.
#   - data in its original order: best n_neighbors = 4,
#     cross-validation score 0.668573607933, training score 0.828488372093
#   - data shuffled so that it is not all democrats followed by all
#     republicans: best n_neighbors = 1, cross-validation score .689,
#     training score 1.0