Example #2
from time import time
from sklearn.feature_extraction.text import CountVectorizer
# TransparentLogisticRegression, load_imdb and TopInstances are assumed to be
# importable from the project's own modules.

def testLR():
    
    print "Loading the data"
    
    t0 = time()
    
    vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))
    X_train, y_train, X_test, y_test, train_corpus, test_corpus = load_imdb("C:\\Users\\Mustafa\\Desktop\\aclImdb", shuffle=True, vectorizer=vect)
    feature_names = vect.get_feature_names()
    
    duration = time() - t0

    print
    print "Loading took %0.2fs." % duration
    print
    
    print "Fitting the classifier"
    
    t0 = time()
    clf = TransparentLogisticRegression(penalty='l1', C=0.1)
    clf.fit(X_train, y_train)
    
    duration = time() - t0

    print
    print "Fitting took %0.2fs." % duration
    print
    
    print "Predicting the evidences"
    
    t0 = time()
    neg_evi, pos_evi = clf.predict_evidences(X_test)
    
    duration = time() - t0

    print
    print "Predicting evidences took %0.2fs." % duration
    print
    
    print "Predicting the probs"
    
    t0 = time()
    probs = clf.predict_proba(X_test)
    
    duration = time() - t0

    print
    print "Predicting probs took %0.2fs." % duration
    print
    
    # helper that ranks test instances by their evidence
    ti = TopInstances(neg_evi, pos_evi, clf.get_bias())
    
    total_evi = neg_evi + pos_evi
    
    print
    print "Most negative"
    print
    i = ti.most_negatives()[0]
    print total_evi[i], neg_evi[i], pos_evi[i], probs[i]
    print test_corpus[i]
    
    print
    print "Most positive"
    print
    i = ti.most_positives()[0]
    print total_evi[i], neg_evi[i], pos_evi[i], probs[i]
    print test_corpus[i]
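The predict_evidences and TopInstances helpers used above are project-specific and not shown in these snippets. As a rough illustration of the underlying idea (and assuming TransparentLogisticRegression behaves like a plain scikit-learn LogisticRegression), the per-instance negative and positive evidence of a linear model can be recovered by splitting its decision function into negative and positive feature contributions:

# Sketch only: split a fitted linear model's decision function into negative
# and positive per-instance contributions. This is not the project's actual
# predict_evidences implementation.
import numpy as np

def split_evidences(clf, X):
    # densify for clarity; a sparse .multiply() would avoid the memory cost
    Xd = np.asarray(X.todense()) if hasattr(X, "todense") else np.asarray(X)
    contrib = Xd * clf.coef_[0]                   # per-feature contribution of each instance
    neg_evi = np.minimum(contrib, 0).sum(axis=1)  # evidence pushing towards the negative class
    pos_evi = np.maximum(contrib, 0).sum(axis=1)  # evidence pushing towards the positive class
    return neg_evi, pos_evi

With such a decomposition, the most negative and most positive test instances are simply np.argmin and np.argmax of neg_evi + pos_evi (plus the model's intercept), which appears to be what TopInstances provides through most_negatives() and most_positives().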
print "The # of positive instances in all data: ", np.sum(y)
print "The ratio of positive instances: ", np.sum(y)/float(num_inst)
print "The ratio of negative instances: ", 1-(np.sum(y)/float(len(y)))
print ""

ss = ShuffleSplit(num_inst, n_iter=1, test_size=0.33, random_state=2)

# n_iter=1, so take the single (train, test) index split the iterator yields
train_index, test_index = next(iter(ss))

print "Y_train total:", len(y[train_index])
print "Y_train positive:", np.sum(y[train_index])
print "Ratio of positive in train instances", np.sum(y[train_index])/float(len(y[train_index]))

clf_ori = TransparentLogisticRegression()
clf_ss = TransparentLogisticRegression()
clf_ig = TransparentLogisticRegression()


X_ss = scale(X)                    # standard (z-score) scaled copy of the data
scale_ = decision_tree_scale()     # project-specific scaler

X_ig = scale_.fit_transform(X, y)

clf_ori.fit(X, y)
clf_ss.fit(X_ss, y)
clf_ig.fit(X_ig, y)

print clf_ori.coef_
print clf_ss.coef_
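Note that ShuffleSplit(num_inst, n_iter=1, ...) above is the old sklearn.cross_validation API, which was removed in scikit-learn 0.20. Under the current sklearn.model_selection API the same single shuffled split would look roughly like this (with placeholder data standing in for the real X and y):

# Equivalent single train/test split with the modern scikit-learn API.
import numpy as np
from sklearn.model_selection import ShuffleSplit

X = np.arange(20).reshape(10, 2)   # placeholder features
y = np.array([0, 1] * 5)           # placeholder binary labels

ss = ShuffleSplit(n_splits=1, test_size=0.33, random_state=2)
train_index, test_index = next(ss.split(X, y))

print("Train size:", len(train_index))
print("Positives in train:", int(np.sum(y[train_index])))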
Example #4
import pickle
from time import time

import numpy as np

# Vectorizer, Classifier, load_imdb and ce_squared are assumed to come from
# the project's own modules.

t0 = time()

vect = Vectorizer(min_df=5, max_df=1.0, binary=False, ngram_range=(1, 1))

X_train, y_train, X_val, y_val, X_test, y_test, train_corpus, val_corpus, test_corpus = load_imdb("./aclImdb", shuffle=True, vectorizer=vect)

# expand the 1-D labels into a two-column matrix [y, 1 - y] so they can be
# compared against the two-column output of predict_proba
y_test_na = y_test[:, np.newaxis]
y_test_na = np.append(y_test_na, 1-y_test_na, axis=1)

y_val_na = y_val[:, np.newaxis]
y_val_na = np.append(y_val_na, 1-y_val_na, axis=1)

clf = Classifier()
clf.fit(X_train, y_train)
ctrl_clf = clf
ctrl_error = ce_squared(y_test_na, clf.predict_proba(X_test))
ctrl_acc = clf.score(X_test, y_test)

duration = time() - t0
print("Loading the dataset and fitting the control classifier took {:0.2f}s.".format(duration), '\n')

# restore a previously pickled archive of classifiers
with open('clf12.arch', 'rb') as f:
    clf_arch = pickle.load(f)

clf_arch.stats()

ctrl_clf = clf_arch.ctrl_clf
best_clf = clf_arch.classifiers[-1]
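ce_squared is another project-specific helper that is not defined in these snippets. Given its arguments (the two-column target matrix built above and the output of predict_proba), it appears to measure how far the predicted class probabilities are from the targets; a minimal stand-in under that assumption could be a mean squared error over the probability matrix:

# Assumed behaviour only: mean squared error between the two-column target
# matrix and the predicted probability matrix. The project's real ce_squared
# may compute something different (e.g. a cross-entropy variant).
import numpy as np

def ce_squared_stub(y_true_matrix, probs):
    return np.mean((y_true_matrix - probs) ** 2)

Whatever the exact definition, ctrl_error and ctrl_acc act as reference figures for the control classifier, against which the classifiers stored in the pickled clf_arch archive (clf_arch.classifiers, with clf_arch.ctrl_clf as its own control) can then presumably be compared.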