# Factories for the single-stage classifiers, keyed by the ``classifier``
# argument.  Lambdas are used so that a fresh, unfitted estimator is created
# for every fold and the estimator name is only looked up when it is needed.
_SINGLE_STAGE_CLASSIFIERS = {
    "linear": lambda: svm.LinearSVC(),
    "ovo": lambda: svm.SVC(),
    "NB": lambda: MultinomialNB(),
    "SGD": lambda: linear_model.SGDClassifier(),
    "centroid": lambda: NearestCentroid(),
    "perceptron": lambda: Perceptron(),
    "ridge": lambda: RidgeClassifier(),
}

# Feature-extraction switches passed to every getXandY call made during
# cross-validation.
_CV_FEATURE_OPTIONS = {
    "tfidf": False,
    "L1normalization": False,
    "L2normalization": True,
}


def _coarse_label(y):
    """Map a fine-grained label to the coarse class used by the top level."""
    value = int(y)
    if value < 2:
        return "low"
    if value == 2:
        return "2"
    return "high"


def _train_level_classifier(trainingdata, vocabulary,
                            trainingphrases_to_sentiment, X_train, Y_train):
    """Fit and return the ``LevelClassifier`` used for ``classifier="levels"``.

    A top-level ``LinearSVC`` is trained on the coarse labels
    ("low" / "2" / "high"); two further ``LinearSVC`` models are trained on
    the features ``getXandY`` returns for ``separate="low"`` and
    ``separate="high"``.
    """
    print("Training top level")
    Y_train_coarse = [_coarse_label(y) for y in Y_train]
    model = LevelClassifier()
    top = svm.LinearSVC()
    top.fit(X_train, Y_train_coarse)
    # NOTE(review): the 2nd/3rd arguments look like (level, parent label);
    # confirm against LevelClassifier.add_classifier.
    model.add_classifier(top, 1, "")

    for branch, banner in (("low", "Training Low"),
                           ("high", "Training High")):
        print(banner)
        X_branch, Y_branch = getXandY(
            trainingdata, vocabulary, trainingphrases_to_sentiment,
            separate=branch, **_CV_FEATURE_OPTIONS)
        branch_model = svm.LinearSVC()
        branch_model.fit(X_branch, Y_branch)
        model.add_classifier(branch_model, 2, branch)
    return model


def kFoldCrossValidation(all_training_data, trainingphrases_to_sentiment,
                         vocabulary, k=5, classifier="linear"):
    """Run k-fold cross-validation and return the mean fold score.

    ``all_training_data`` is shuffled IN PLACE and then cut into ``k``
    contiguous test folds of ``len(all_training_data) // k`` examples each.
    For every fold a fresh model is trained on the remaining examples and
    evaluated with ``model.score`` on the fold.  When the data size is not a
    multiple of ``k`` the leftover tail examples are never tested on; they
    are only ever part of the training portion.

    Args:
        all_training_data: Mutable sequence (list) of training examples;
            it is reordered by ``random.shuffle``.
        trainingphrases_to_sentiment: Passed through to ``getXandY``.
        vocabulary: Passed through to ``getXandY``.
        k: Number of folds; must be at least 1 and no larger than the number
            of examples.
        classifier: One of "linear", "ovo", "NB", "SGD", "centroid",
            "perceptron", "ridge" or "levels".

    Returns:
        The sum of the per-fold ``model.score(X_test, Y_test)`` values
        divided by ``k``.

    Raises:
        ValueError: If ``k`` is smaller than 1, if ``classifier`` is not a
            recognised name, or if there are fewer examples than folds.
    """
    # Validate cheap arguments first so that a typo does not surface only
    # after the (expensive) feature extraction of the first fold.
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))
    if classifier != "levels" and classifier not in _SINGLE_STAGE_CLASSIFIERS:
        raise ValueError("unknown classifier %r" % (classifier,))

    random.shuffle(all_training_data)
    number_of_examples = len(all_training_data)
    size_of_test_set = number_of_examples // k
    if size_of_test_set == 0:
        raise ValueError(
            "need at least k=%d examples for cross-validation, got %d"
            % (k, number_of_examples))

    accuracy = 0.0
    # Iterate over exactly k folds.  Stepping through the data until it runs
    # out can yield more than k folds when the size is not a multiple of k,
    # which would make the final division by k overstate the mean.
    for fold in range(k):
        start = fold * size_of_test_set
        stop = start + size_of_test_set
        trainingdata = all_training_data[:start] + all_training_data[stop:]
        testdata = all_training_data[start:stop]

        # Single-argument parenthesised print emits the same text whether it
        # is parsed as a Python 2 statement or a Python 3 call.
        print("Building Feature Vectors...")
        X_train, Y_train = getXandY(
            trainingdata, vocabulary, trainingphrases_to_sentiment,
            **_CV_FEATURE_OPTIONS)
        X_test, Y_test = getXandY(
            testdata, vocabulary, trainingphrases_to_sentiment,
            **_CV_FEATURE_OPTIONS)

        print("Training...")
        if classifier == "levels":
            model = _train_level_classifier(
                trainingdata, vocabulary, trainingphrases_to_sentiment,
                X_train, Y_train)
        else:
            model = _SINGLE_STAGE_CLASSIFIERS[classifier]()
            model.fit(X_train, Y_train)

        print("Testing...")
        accuracy += model.score(X_test, Y_test)

    return accuracy / k