# Timed pipeline: load the IMDB dataset, fit a sparse transparent logistic
# regression, and compute per-instance evidence scores on the test set.
# NOTE(review): this fragment duplicates the start of testLR() below and is
# cut off mid-pipeline -- presumably a stale copy; confirm it is still needed.
t0 = time()
# Binary unigram features; min_df=5 drops very rare terms.
vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))
# load_imdb is a project helper; assumes it returns the six values below
# when given a vectorizer -- TODO confirm against its definition.
X_train, y_train, X_test, y_test, train_corpus, test_corpus = load_imdb("C:\\Users\\Mustafa\\Desktop\\aclImdb", shuffle=True, vectorizer=vect)
feature_names = vect.get_feature_names()
duration = time() - t0
print
print "Loading took %0.2fs." % duration
print
print "Fitting the classifier"
t0 = time()
# L1 penalty with strong regularization (C=0.1) yields a sparse model.
clf = TransparentLogisticRegression(penalty='l1', C=0.1)
clf.fit(X_train, y_train)
duration = time() - t0
print
print "Fitting took %0.2fs." % duration
print
print "Predicting the evidences"
t0 = time()
# predict_evidences presumably splits each prediction into negative and
# positive evidence contributions -- verify against the class definition.
neg_evi, pos_evi = clf.predict_evidences(X_test)
duration = time() - t0
def testLR(): print "Loading the data" t0 = time() vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1)) X_train, y_train, X_test, y_test, train_corpus, test_corpus = load_imdb("C:\\Users\\Mustafa\\Desktop\\aclImdb", shuffle=True, vectorizer=vect) feature_names = vect.get_feature_names() duration = time() - t0 print print "Loading took %0.2fs." % duration print print "Fitting the classifier" t0 = time() clf = TransparentLogisticRegression(penalty='l1', C=0.1) clf.fit(X_train, y_train) duration = time() - t0 print print "Fitting took %0.2fs." % duration print print "Predicting the evidences" t0 = time() neg_evi, pos_evi = clf.predict_evidences(X_test) duration = time() - t0 print print "Predicting evidences took %0.2fs." % duration print print "Predicting the probs" t0 = time() probs = clf.predict_proba(X_test) duration = time() - t0 print print "Predicting probs took %0.2fs." % duration print ti = TopInstances(neg_evi, pos_evi, clf.get_bias()) total_evi = neg_evi + pos_evi print print "Most negative" print i = ti.most_negatives()[0] print total_evi[i], neg_evi[i], pos_evi[i], probs[i] print test_corpus[i] print print "Most positive" print i = ti.most_positives()[0] print total_evi[i], neg_evi[i], pos_evi[i], probs[i] print test_corpus[i]
# Class-balance summary for the label vector `y`.
# NOTE(review): `y`, `num_inst`, and `X` are defined outside this fragment;
# the code mixes num_inst and len(y) as the instance count -- presumably
# num_inst == len(y), TODO confirm.
print "The # of positive instances in all data: ", np.sum(y)
print "The ratio of positive instances: ", np.sum(y)/float(num_inst)
print "The ratio of negative instances: ", 1-(np.sum(y)/float(len(y)))
print ""

# Single shuffled 67/33 split; random_state fixed for reproducibility.
ss = ShuffleSplit(num_inst, n_iter=1, test_size=0.33, random_state=2)
for i, j in ss:
    # n_iter=1, so the loop body runs once and keeps the only split.
    train_index = i
    test_index = j

print "Y_train total:", len(y[train_index])
print "Y_train positive:", np.sum(y[train_index])
print "Ratio of positive in train instances", np.sum(y[train_index])/float(len(y[train_index]))

# Fit the same transparent model on three feature representations:
# raw features, standard-scaled, and decision-tree-based scaling.
clf_ori = TransparentLogisticRegression()
clf_ss = TransparentLogisticRegression()
clf_ig = TransparentLogisticRegression()

X_ss = scale(X)
# decision_tree_scale is a project transformer -- presumably an
# information-gain-style rescaling; verify against its definition.
scale_ = decision_tree_scale()
X_ig = scale_.fit_transform(X, y)

clf_ori.fit(X, y)
clf_ss.fit(X_ss, y)
clf_ig.fit(X_ig, y)

# Compare learned coefficients across scalings (clf_ig is fitted but its
# coefficients are not printed in this fragment).
print clf_ori.coef_
print clf_ss.coef_
import numpy as np
import pickle

# Timed setup: load the IMDB dataset (train/val/test), fit a control
# classifier, and record its baseline error and accuracy.
t0 = time()
vect = Vectorizer(min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1))
X_train, y_train, X_val, y_val, X_test, y_test, train_corpus, val_corpus, test_corpus = load_imdb("./aclImdb", shuffle=True, vectorizer=vect)

# Expand the 0/1 label vectors into two-column arrays [y, 1-y] so they are
# comparable to predict_proba output inside ce_squared.
# NOTE(review): assumes y encodes the class matching predict_proba's first
# column -- confirm against load_imdb and the classifier's class order.
y_test_na = y_test[:, np.newaxis]
y_test_na = np.append(y_test_na, 1-y_test_na, axis=1)
y_val_na = y_val[:, np.newaxis]
y_val_na = np.append(y_val_na, 1-y_val_na, axis=1)

clf = Classifier()
clf.fit(X_train, y_train)
ctrl_clf = clf
# Baseline squared cross-entropy-style error and accuracy on the test set
# (ce_squared is a project helper; semantics not visible here).
ctrl_error = ce_squared(y_test_na, clf.predict_proba(X_test))
ctrl_acc = clf.score(X_test, y_test)
duration = time() - t0
print("Loading the dataset took {:0.2f}s.".format(duration), '\n')

# SECURITY: pickle.load executes arbitrary code from the archive; only load
# 'clf12.arch' if it was produced by this project from a trusted source.
with open('clf12.arch', 'rb') as f:
    clf_arch = pickle.load(f)
clf_arch.stats()
# The archived control classifier replaces the one fitted above; the last
# entry in the archive's classifier list is taken as the best model.
ctrl_clf = clf_arch.ctrl_clf
best_clf = clf_arch.classifiers[-1]