示例#1
0
def kFoldCrossValidation(all_training_data,
                         trainingphrases_to_sentiment,
                         vocabulary,
                         k=5,
                         classifier="linear"):
    """Estimate a classifier's accuracy with k-fold cross-validation.

    The examples are shuffled, split into ``k`` contiguous folds of
    ``len(all_training_data) // k`` examples each, and every fold is used
    once as the test set while all remaining examples form the training
    set.  The per-fold scores are averaged.

    Args:
        all_training_data: List of training examples.  NOTE: it is shuffled
            IN PLACE, so the caller's list order changes.
        trainingphrases_to_sentiment: Passed through unchanged to
            ``getXandY`` (presumably a phrase -> sentiment label mapping;
            confirm against ``getXandY``).
        vocabulary: Passed through unchanged to ``getXandY``.
        k: Number of folds.  Must be at least 1 and no larger than the
            number of examples.
        classifier: One of "linear", "ovo", "NB", "SGD", "centroid",
            "perceptron", "ridge" or "levels".

    Returns:
        The mean over the ``k`` folds of ``model.score(X_test, Y_test)``.

    Raises:
        ValueError: If ``k`` is smaller than 1, if ``classifier`` is not a
            recognised name, or if there are fewer examples than folds.
    """
    # Validate up front so bad arguments fail before any expensive feature
    # extraction, instead of surfacing as an unbound `model` after it.
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))
    factories = _simple_classifier_factories()
    if classifier != "levels" and classifier not in factories:
        raise ValueError(
            "unknown classifier %r; expected one of %s"
            % (classifier, ", ".join(sorted(factories) + ["levels"])))

    random.shuffle(all_training_data)
    number_of_examples = len(all_training_data)
    # Floor division keeps the fold size an integer on Python 2 and 3 alike.
    size_of_test_set = number_of_examples // k
    if size_of_test_set == 0:
        raise ValueError(
            "cannot split %d example(s) into %d folds"
            % (number_of_examples, k))

    accuracy = 0.0

    # Run exactly k folds.  Stepping through the data until it runs out
    # (as a range with step `size_of_test_set` would) yields MORE than k
    # folds whenever the remainder n % k is at least one fold long, which
    # would make the final division by k overstate the accuracy.  The
    # trailing n % k examples are therefore only ever used for training.
    for fold in range(k):
        start = fold * size_of_test_set
        stop = start + size_of_test_set
        trainingdata = all_training_data[:start] + all_training_data[stop:]
        testdata = all_training_data[start:stop]

        # Single-argument print(...) emits the same text whether it is
        # parsed as a Python 2 statement or a Python 3 function call.
        print("Building Feature Vectors...")
        X_train, Y_train = _build_features(trainingdata, vocabulary,
                                           trainingphrases_to_sentiment)
        X_test, Y_test = _build_features(testdata, vocabulary,
                                         trainingphrases_to_sentiment)

        print("Training...")
        if classifier == "levels":
            model = _train_level_classifier(trainingdata, vocabulary,
                                            trainingphrases_to_sentiment,
                                            X_train, Y_train)
        else:
            model = factories[classifier]()
            model.fit(X_train, Y_train)

        print("Testing...")
        accuracy += model.score(X_test, Y_test)

    return accuracy / k


def _simple_classifier_factories():
    """Map each single-stage classifier name to a zero-argument constructor."""
    # Lambdas defer construction so every fold gets a fresh, unfitted model.
    return {
        "linear": lambda: svm.LinearSVC(),
        "ovo": lambda: svm.SVC(),
        "NB": lambda: MultinomialNB(),
        "SGD": lambda: linear_model.SGDClassifier(),
        "centroid": lambda: NearestCentroid(),
        "perceptron": lambda: Perceptron(),
        "ridge": lambda: RidgeClassifier(),
    }


def _build_features(examples, vocabulary, trainingphrases_to_sentiment,
                    **extra):
    """Call getXandY with the fixed settings used by cross-validation."""
    return getXandY(examples,
                    vocabulary,
                    trainingphrases_to_sentiment,
                    tfidf=False,
                    L1normalization=False,
                    L2normalization=True,
                    **extra)


def _coarse_label(y):
    """Collapse a label into "low" (below 2), "2", or "high" (above 2)."""
    value = int(y)
    if value < 2:
        return "low"
    if value == 2:
        return "2"
    return "high"


def _train_level_classifier(trainingdata, vocabulary,
                            trainingphrases_to_sentiment, X_train, Y_train):
    """Train the two-level "levels" model: a coarse top stage plus low/high stages."""
    # Coarse classifier: low / "2" / high.
    print("Training top level")
    Y_train_coarse = [_coarse_label(y) for y in Y_train]

    model = LevelClassifier()
    top = svm.LinearSVC()
    top.fit(X_train, Y_train_coarse)
    model.add_classifier(top, 1, "")

    # Second-level classifiers, each fitted on the features getXandY
    # returns for that branch via its `separate` argument.
    for branch, banner in (("low", "Training Low"),
                           ("high", "Training High")):
        print(banner)
        X_branch, Y_branch = _build_features(trainingdata, vocabulary,
                                             trainingphrases_to_sentiment,
                                             separate=branch)
        branch_model = svm.LinearSVC()
        branch_model.fit(X_branch, Y_branch)
        model.add_classifier(branch_model, 2, branch)

    return model