"""Build an ensemble on the 20 Newsgroups training set and print its F1.

The script vectorizes the 20 Newsgroups training subset with TF-IDF, holds
out 10% of it, lets ``EnsembleSelection`` form an ensemble from a small pool
of candidate classifiers, and prints the weighted F1 score of the ensemble's
predictions on the held-out part.
"""

from ensemble.EnsembleSelection import EnsembleSelection
from ensemble.EnsembleClassifier import EnsembleClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import SGDClassifier

try:
    # ``sklearn.cross_validation`` was superseded by ``sklearn.model_selection``
    # and later removed; alias the new module to the old name so the call
    # site below works unchanged on either generation of scikit-learn.
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation


def main():
    """Train the ensemble and print its held-out weighted F1 score."""
    newsgroups_train = fetch_20newsgroups(subset='train')

    vectorizer = TfidfVectorizer()
    train_vectors = vectorizer.fit_transform(newsgroups_train.data)

    # Hold out 10% of the vectorized documents; the fixed seed keeps the
    # split reproducible between runs.
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        train_vectors,
        newsgroups_train.target,
        test_size=0.1,
        random_state=0,
    )

    selection = EnsembleSelection()

    # Candidate pool handed to the selection step.
    # NOTE(review): the ``1`` passed to each generate_* helper is presumably
    # the number of classifiers to generate -- confirm in EnsembleSelection.
    models = (
        [SGDClassifier(loss="hinge", penalty="l2")]
        + [LinearSVC()]
        + selection.generate_logistic_regression_classifiers(1)
        + selection.generate_multionomial_nb_classifiers(1)
        + selection.generate_bernoulli_nb_classifiers(1)
    )

    # NOTE(review): the positional ``4`` is presumably the ensemble size or
    # the number of selection rounds -- confirm in form_ensemble.
    ensemble = selection.form_ensemble(
        x_train, y_train, models, 4, error_metric_name='f1'
    )

    clf = EnsembleClassifier()
    y_predicted = clf.predict(x_test, ensemble)

    # f1_score expects (y_true, y_pred). With support-weighted averaging the
    # class weights come from the first argument, so the order matters.
    # ``average='weighted'`` is stated explicitly because the target is
    # multiclass and newer scikit-learn defaults to binary averaging.
    # print() with a single argument behaves the same on Python 2 and 3.
    print(f1_score(y_test, y_predicted, average='weighted'))


if __name__ == "__main__":
    main()