# Fragment of a per-fold stacking step; the enclosing loop header and the
# definitions of X_train / X_test, z and the clf_* objects are not visible here.
#
# What the statements below do, in order:
#   * slice y and X_den with train_index / test_index for the current fold;
#   * fit clf_mNB, clf_ridge, clf_SGD, clf_lSVC and clf_SVC on (X_train, y_train);
#   * score X_test with predict_proba (clf_mNB, clf_SVC) and with
#     decision_function (clf_ridge, clf_SGD, clf_lSVC);
#   * add the five score arrays element-wise into z_temp and append it to z
#     along axis 0;
#   * delete row 0 of z (per the inline comment, a placeholder row of zeros
#     used when z was created), then print z and its shape.
#
# NOTE(review): X_den_train / X_den_test are assigned but not used in this fragment.
# NOTE(review): z_temp mixes predict_proba outputs with decision_function
#   scores, so the inline remark that z holds "the sum of probabilities" is not
#   accurate — presumably decision_function values are unbounded margins;
#   confirm against the scikit-learn documentation before relying on z's scale.
# NOTE(review): presumably the fit/predict/append statements run once per fold
#   and np.delete / print run after the loop — confirm against the original file.
# NOTE(review): the line breaks of this fragment were lost; as written,
#   everything after the first '#' is parsed as one comment. Restore the
#   original line structure before running. Uses Python 2 print statements.
y_train, y_test = y[train_index], y[test_index] X_den_train, X_den_test = X_den[train_index], X_den[test_index] # feed models clf_mNB.fit(X_train, y_train) clf_ridge.fit(X_train, y_train) clf_SGD.fit(X_train, y_train) clf_lSVC.fit(X_train, y_train) clf_SVC.fit(X_train, y_train) # get prediction for this fold run prob_mNB = clf_mNB.predict_proba(X_test) prob_ridge = clf_ridge.decision_function(X_test) prob_SGD = clf_SGD.decision_function(X_test) prob_lSVC = clf_lSVC.decision_function(X_test) prob_SVC = clf_SVC.predict_proba(X_test) # add prob functions into the z 2d-array z_temp = (prob_mNB + prob_ridge + prob_SGD + prob_lSVC + prob_SVC) z = np.append(z, z_temp, axis=0) # remove the first sub-1d-array of z, due to the creation with 0s z = np.delete(z, 0, 0) # the result of z is a 2d array with shape of (n_samples, n_categories) # the elements are the sum of probabilities of classifiers on each (sample,category) pair print z print 'z shape: ', z.shape
# Fragment of a per-fold stacking step on a nested split of the training data;
# the enclosing loop header and the definitions of X_train_train /
# X_train_test, z and the clf_* objects are not visible here.
#
# What the statements below do, in order:
#   * slice y_train with train_index / test_index into y_train_train /
#     y_train_test (the matching X_den slicing is commented out);
#   * fit clf_mNB, clf_ridge, clf_lSVC and clf_SVC on
#     (X_train_train, y_train_train) — the clf_kNN fit is commented out;
#   * score X_train_test with predict_proba (clf_mNB, clf_SVC) and with
#     decision_function (clf_ridge, clf_lSVC);
#   * add the four score arrays element-wise into z_temp and append it to z
#     along axis 0 (two alternative z_temp definitions are commented out);
#   * delete row 0 of z (per the inline comment, a placeholder row of zeros
#     used when z was created); an optional L2 normalisation of z is
#     commented out.
#
# NOTE(review): z_temp mixes predict_proba outputs with decision_function
#   scores, so the inline remark that z holds "the sum of probabilities" is not
#   accurate — confirm the intended scale of z.
# NOTE(review): presumably the fit/predict/append statements run once per fold
#   and np.delete runs after the loop — confirm against the original file.
# NOTE(review): the line breaks of this fragment were lost; as written,
#   everything after the first '#' is parsed as one comment. Restore the
#   original line structure before running.
y_train_train, y_train_test = y_train[train_index], y_train[test_index] # X_den_train, X_den_test = X_den[train_index], X_den[test_index] # feed models clf_mNB.fit(X_train_train, y_train_train) # clf_kNN.fit(X_train_train, y_train_train) clf_ridge.fit(X_train_train, y_train_train) clf_lSVC.fit(X_train_train, y_train_train) clf_SVC.fit(X_train_train, y_train_train) # get prediction for this fold run prob_mNB = clf_mNB.predict_proba(X_train_test) # prob_kNN = clf_kNN.predict_proba(X_train_test) prob_ridge = clf_ridge.decision_function(X_train_test) prob_lSVC = clf_lSVC.decision_function(X_train_test) prob_SVC = clf_SVC.predict_proba(X_train_test) # update z array for each model # z_temp = prob_lSVC # z_temp = (prob_ridge + prob_lSVC) z_temp = (prob_mNB + prob_ridge + prob_lSVC + prob_SVC) z = np.append(z, z_temp, axis=0) # remove the first sub-1d-array of z, due to the creation with 0s z = np.delete(z, 0, 0) # the result of z is a 2d array with shape of (n_samples, n_categories) # the elements are the sum of probabilities of classifiers on each (sample,category) pair # Possible preprocessing on z # z = normalize(z, norm="l2")
# Fragment that inspects the raw scores of several already-fitted classifiers
# on X_new; doc, labels, X_new and the clf_* objects are defined outside this view.
#
# What the statements below do, in order:
#   * print doc and iterate over labels (the inline comment says label[0] is
#     the score; label[1] is used to index data_train.target_names);
#   * print clf_nb and the result of clf_nb.predict_proba(X_new);
#   * print clf_lsvc and the result of clf_lsvc.decision_function(X_new);
#   * print clf_svc and the result of clf_svc.predict_proba(X_new) — the
#     decision_function call is commented out, with a note that it raised
#     "ValueError: setting an array element with a sequence";
#   * print clf_sgd and compute clf_sgd.decision_function(X_new); the
#     predict_proba call is commented out, with a note that it is only
#     supported for binary classification. The SGD scores are not printed
#     within this fragment.
#
# NOTE(review): after the line breaks were lost it is no longer clear whether
#   `print data_train.target_names[label[1]], label[0]` is live code or part of
#   the preceding comment, nor whether the bare `print` belongs to the loop
#   body — confirm against the original file.
# NOTE(review): as written, this is not valid Python (statements are fused on
#   one line and everything after the first '#' is a comment). Restore the
#   original line structure before running. Uses Python 2 print statements.
print doc for label in labels: # label[0]: score; label[1]: # print data_train.target_names[label[1]], label[0] print ##################################### # decision_function and predict_proba print clf_nb pred_prob = clf_nb.predict_proba(X_new) print pred_prob print print clf_lsvc pred_decision = clf_lsvc.decision_function(X_new) print pred_decision print print clf_svc # SVC should have the decision_function method, but got error: # error - ValueError: setting an array element with a sequence # pred_decision = clf_svc.decision_function(X_new) pred_prob = clf_svc.predict_proba(X_new) print pred_prob print print clf_sgd pred_decision = clf_sgd.decision_function(X_new) # pred_prob is only supported for binary classification! # pred_prob = clf_sgd.predict_proba(X_new)
# Hold out part of the data for validation
# (original author's note: ~140k rows split into ~100k training / ~40k test).
ff_train, ff_val = split_dataframe(test_ff)

print("Training...")
t1 = time()

# Raw token counts first, then term-frequency weighting with IDF disabled.
vectorizer = CountVectorizer()
tfidf_transformer = TfidfTransformer(use_idf=False)
train_counts = vectorizer.fit_transform(ff_train["TitlePlusBody"])
# Original author's note on the resulting matrix: 98190x285052.
train_tfidf_table = tfidf_transformer.fit_transform(train_counts)

clf = LinearSVC()
clf.fit(train_tfidf_table, ff_train["OpenStatus"])

print("Testing...")
# Reuse the vocabulary and weighting fitted on the training split.
test_counts = vectorizer.transform(ff_val["TitlePlusBody"])
test_tfidf_table = tfidf_transformer.transform(test_counts)

# Plain accuracy of the hard predictions on the validation split.
predict = clf.predict(test_tfidf_table)
accuracy = np.mean(predict == ff_val["OpenStatus"])
print("np.mean: %f" % accuracy)

# LinearSVC has no predict_proba here, so the decision values are squashed
# through a logistic function and raised to the power 3.5 before being passed
# to mcll as pseudo-probabilities.
linear_decisions = clf.decision_function(test_tfidf_table)
squashed = 1 / (1 + np.exp(-linear_decisions))
predicted_probs = squashed ** 3.5
mcll_score = mcll(predicted_probs, ff_val["OpenStatus"].values)
print("MCLL: %f" % mcll_score)

t2 = time()
print("done in %d seconds" % (t2 - t1))