Example #1

from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import load_data   # local project modules, names taken from the calls below
import utilities


def main():
    # Load data
    X_train, y_train, X_test, y_test = load_data.load_data()
    # Transform data into a vector of TF-IDF values
    count_vect = CountVectorizer(ngram_range=(1, 2))
    X_train_counts = count_vect.fit_transform(X_train)
    tfidf_transformer = TfidfTransformer(use_idf=True)
    X_train_dtm = tfidf_transformer.fit_transform(X_train_counts)
    # Transform test data
    X_test_counts = count_vect.transform(X_test)
    X_test_dtm = tfidf_transformer.transform(X_test_counts)  # apply the IDF weights fitted on the training data (do not refit on the test set)

    # using default params
    clf = AdaBoostClassifier()
    clf.fit(X_train_dtm, y_train)
    y_pred_class = clf.predict(X_test_dtm)

    # utilities.print_misclassified_samples(X_test, y_pred_class, y_test)
    utilities.print_stats(y_pred_class, y_test)
Example #2

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

import load_data
import utilities


def main():
    # Load data
    X_train, y_train, X_test, y_test = load_data.load_data()
    # Transform data into a vector of TF-IDF values
    count_vect = CountVectorizer(ngram_range=(1, 2))
    X_train_counts = count_vect.fit_transform(X_train)
    tfidf_transformer = TfidfTransformer(use_idf=True)
    X_train_dtm = tfidf_transformer.fit_transform(X_train_counts)
    # Transform test data
    X_test_counts = count_vect.transform(X_test)
    X_test_dtm = tfidf_transformer.transform(X_test_counts)  # apply the IDF weights fitted on the training data (do not refit on the test set)

    # Using the optimal value of alpha obtained via GridSearchCV (a sketch of such a search follows this example)
    clf = MultinomialNB(alpha=0.02)
    clf.fit(X_train_dtm, y_train)
    y_pred_class = clf.predict(X_test_dtm)

    # utilities.print_misclassified_samples(X_test, y_pred_class, y_test)
    utilities.print_stats(y_pred_class, y_test)
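The alpha=0.02 above is described as having come from a grid search. A minimal sketch of how such a search could be run on the training matrix follows; the parameter grid, the 5-fold CV, and the accuracy scoring are assumptions for illustration, not values taken from the original project.

from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB


def tune_alpha(X_train_dtm, y_train):
    # Hypothetical grid; the values actually searched in the original project are unknown.
    param_grid = {'alpha': [0.001, 0.01, 0.02, 0.05, 0.1, 0.5, 1.0]}
    grid = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring='accuracy')
    grid.fit(X_train_dtm, y_train)
    return grid.best_params_['alpha']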
Example #3

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression

import load_data
import utilities


def main():
    # Load data
    X_train, y_train, X_test, y_test = load_data.load_data()
    # Transform data into a vector of TF-IDF values
    count_vect = CountVectorizer(ngram_range=(1, 2))
    X_train_counts = count_vect.fit_transform(X_train)
    tfidf_transformer = TfidfTransformer(use_idf=True)
    X_train_dtm = tfidf_transformer.fit_transform(X_train_counts)
    # Transform test data
    X_test_counts = count_vect.transform(X_test)
    X_test_dtm = tfidf_transformer.transform(X_test_counts)  # apply the IDF weights fitted on the training data (do not refit on the test set)

    # Not optimized; the l2 penalty should probably be tested as well (a comparison sketch follows this example)
    clf = LogisticRegression(penalty='l1', solver='liblinear')  # l1 requires the liblinear or saga solver
    clf.fit(X_train_dtm, y_train)
    y_pred_class = clf.predict(X_test_dtm)

    # utilities.print_misclassified_samples(X_test, y_pred_class, y_test)
    utilities.print_stats(y_pred_class, y_test)
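The comment above notes that the l2 penalty still needs to be tried. One way to run that comparison is sketched here with cross_val_score; the fold count and the use of the liblinear solver are assumptions, not taken from the original code.

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score


def compare_penalties(X_train_dtm, y_train):
    # Hypothetical helper: compare l1 vs. l2 regularisation by mean cross-validated accuracy.
    for penalty in ('l1', 'l2'):
        clf = LogisticRegression(penalty=penalty, solver='liblinear')
        scores = cross_val_score(clf, X_train_dtm, y_train, cv=5)
        print("penalty={0}: mean CV accuracy={1:.4f}".format(penalty, scores.mean()))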
Example #4

import time

from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import load_data
import utilities


def main():
    start_time = time.time()
    # Load data
    X_train, y_train, X_test, y_test = load_data.load_data()
    # Transform data into a vector of TF-IDF values
    count_vect = CountVectorizer(ngram_range=(1, 2))
    X_train_counts = count_vect.fit_transform(X_train)
    tfidf_transformer = TfidfTransformer(use_idf=True)
    X_train_dtm = tfidf_transformer.fit_transform(X_train_counts)
    # Transform test data
    X_test_counts = count_vect.transform(X_test)
    X_test_dtm = tfidf_transformer.transform(X_test_counts)  # apply the IDF weights fitted on the training data (do not refit on the test set)
    data_load_time = time.time() - start_time

    # Not optimized
    C = 1.0
    classifier_dict = {
        "SVC with linear kernel":
        svm.SVC(kernel='linear', C=C),
        "SVC with RBF kernel":
        svm.SVC(kernel='rbf', gamma=0.7, C=C),
        "SVC with polynomial (degree 3) kernel":
        svm.SVC(kernel='poly', degree=3, C=C),
        "LinearSVC (linear kernel)":
        svm.LinearSVC(C=C)
    }

    for key, clf in classifier_dict.items():
        start_time = time.time()
        clf.fit(X_train_dtm, y_train)
        y_pred_class = clf.predict(X_test_dtm)
        end_time = time.time()

        print(key)
        # utilities.print_misclassified_samples(X_test, y_pred_class, y_test)
        utilities.print_stats(y_pred_class, y_test)
        print "Execution time={0} sec \n".format(end_time - start_time +
                                                 data_load_time)
Example #5 (fragment of a hand-rolled Naive Bayes text classifier; the snippet begins mid-method)
                if word in nb_dict_features:
                    relative_word_occurence = nb_dict_features[word]
                    class_probability *= relative_word_occurence
                else:
                    class_probability *= 0  # an unseen word zeroes out the whole class probability; see the smoothing note after this example
            Y_dict[label] = class_probability
        return self.get_max_value_key(Y_dict)

    def predict(self, X):
        self.predicted_Y_values = []
        n = len(X)
        for ii in range(0, n):
            X_elem = X[ii]
            prediction = self.classify_single_elem(X_elem)
            self.predicted_Y_values.append(prediction)
        return self.predicted_Y_values


if __name__ == '__main__':
    X_train, Y_train, X_test, Y_test = ld.load_data()
    start_time = time.time()
    for i in range(len(X_train)):
        X_train[i] = X_train[i].split()
    for i in range(len(X_test)):
        X_test[i] = X_test[i].split()
    nbc = NaiveBayesTextClassifier()
    nbc.train(X_train, Y_train)
    y_pred_class = nbc.predict(X_test)
    print('Execution time={0} sec'.format(time.time() - start_time))
    utilities.print_stats(y_pred_class, Y_test)
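In the classification loop of Example #5, a word that never co-occurred with a class multiplies class_probability by zero, so a single unseen word rules that class out entirely. The usual remedy is Laplace (add-one) smoothing. A standalone sketch of a smoothed likelihood lookup is given below; the argument names and dictionary layout are assumptions, since the training side of NaiveBayesTextClassifier is not shown in this fragment.

def smoothed_likelihood(word, word_counts, total_words, vocabulary_size):
    # Laplace (add-one) smoothing: an unseen word gets a small non-zero probability
    # instead of zeroing out the whole class. word_counts maps word -> count for one
    # class; these names are hypothetical, not taken from the original classifier.
    count = word_counts.get(word, 0)
    return (count + 1.0) / (total_words + vocabulary_size)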