def test_two_estimators_predict_proba1(self):
    """Fit a pipeline holding two classifiers and call ``predict_proba``.

    The pipeline scales the data, fans out to PCA, Nystroem and a
    PassiveAggressiveClassifier, concatenates the branch outputs, passes
    them through a NoOp and ends in a second PassiveAggressiveClassifier.
    The test makes no assertions; it only exercises ``fit`` on the training
    data and ``predict_proba`` on the test data.
    """
    # Assemble the pipeline one stage at a time, left to right.
    pipeline = StandardScaler() >> (
        PCA() & Nystroem() & PassiveAggressiveClassifier()
    )
    pipeline = pipeline >> ConcatFeatures()
    pipeline = pipeline >> NoOp()
    pipeline = pipeline >> PassiveAggressiveClassifier()

    pipeline.fit(self.X_train, self.y_train)
    pipeline.predict_proba(self.X_test)
def score_solution(model, save=0):
    """Train the solution's pipeline and score it with ROC AUC.

    Args:
        model: Classification model forwarded to ``solution.get_pipeline``.
        save: When equal to 1, the fitted pipeline is dumped to
            ``Best_Estimator.sav`` with joblib before the score is returned.

    Returns:
        The result of ``sklearn.metrics.roc_auc_score`` computed from the
        test labels and the predicted scores.

    Raises:
        AssertionError: If ``solution.get_pipeline`` does not return an
            ``sklearn.pipeline.Pipeline``, or if the ``predict_proba``
            output is neither 1-D nor 2-D with exactly two columns.
    """
    # Ask the solution for the model pipeline.
    import solution

    pipeline = solution.get_pipeline(model)
    # Raised explicitly (rather than via `assert`) so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if not isinstance(pipeline, sklearn.pipeline.Pipeline):
        raise AssertionError(
            'Your `solution.get_pipeline` implementation should '
            'return an `sklearn.pipeline.Pipeline`.')

    # Train the model on the training DataFrame.
    X_train, y_train = get_data(subset='train')
    pipeline.fit(X_train, y_train)

    # Apply the model to the test DataFrame.
    X_test, y_test = get_data(subset='test')
    y_pred = pipeline.predict_proba(X_test)

    # Check that the predicted probabilities have an sklearn-compatible shape.
    shape_ok = (y_pred.ndim == 1) or \
        (y_pred.ndim == 2 and y_pred.shape[1] == 2)
    if not shape_ok:
        raise AssertionError(
            "The predicted probabilities should match sklearn's "
            "`predict_proba` output shape.")
    # For a two-column result keep only the second column.
    y_pred = y_pred if y_pred.ndim == 1 else y_pred[:, 1]

    # Optionally persist the fitted pipeline.
    if save == 1:
        joblib.dump(pipeline, 'Best_Estimator.sav')

    # Evaluate the predictions with the AUC of the ROC curve.
    return sklearn.metrics.roc_auc_score(y_test, y_pred)
def estimate_probability_multilabel(vectorizer, model, streamer):
    """
    Generate probabilities for a multilabel binary estimator

    Arguments:
        * vectorizer: a sklearn Vectorizer (or pipeline)
        * model: a quantgov.estimator.Estimator
        * streamer: a quantgov.corpora.CorpusStreamer

    Yields:
        2-tuples of docindex and a tuple holding, for each label, the
        probability found in that label's class-1 column
    """
    def positive_column(label_classes):
        # Position of the class equal to 1 among this label's classes.
        return [int(cls) for cls in label_classes].index(1)

    pipeline = get_pipeline(vectorizer, model)
    texts = (doc.text for doc in streamer)

    try:
        truecols = tuple(
            positive_column(label_classes)
            for label_classes in model.model.classes_
        )
    except AttributeError:
        # Fall back to the per-label estimators held by the last step.
        final_step = model.model.steps[-1][-1]
        truecols = tuple(
            positive_column(est.classes_)
            for est in final_step.estimators_
        )

    predicted = pipeline.predict_proba(texts)
    for row, docidx in enumerate(streamer.index):
        per_label = tuple(
            label_predictions[row, truecols[label]]
            for label, label_predictions in enumerate(predicted)
        )
        yield docidx, per_label
def test_multiple_estimators_predict_predict_proba(self):
    """Fit a pipeline with estimators at several stages, then predict.

    The pipeline scales the data, combines LogisticRegression with PCA,
    concatenates, combines NoOp with LinearSVC, concatenates again and
    ends in a KNeighborsClassifier. The test makes no assertions; it only
    exercises ``fit``, ``predict_proba`` and ``predict``.
    """
    # Assemble the pipeline one stage at a time, left to right.
    pipeline = StandardScaler() >> (LogisticRegression() & PCA())
    pipeline = pipeline >> ConcatFeatures()
    pipeline = pipeline >> (NoOp() & LinearSVC())
    pipeline = pipeline >> ConcatFeatures()
    pipeline = pipeline >> KNeighborsClassifier()

    pipeline.fit(self.X_train, self.y_train)

    # Results are intentionally discarded.
    pipeline.predict_proba(self.X_test)
    pipeline.predict(self.X_test)
def estimate_probability_multiclass(vectorizer, model, streamer):
    """
    Generate probabilities for a one-label, multiclass estimator

    Arguments:
        * vectorizer: a sklearn Vectorizer (or pipeline)
        * model: a quantgov.estimator.Estimator
        * streamer: a quantgov.corpora.CorpusStreamer

    Yields:
        2-tuples of docindex and the matching entry of the pipeline's
        ``predict_proba`` output
    """
    pipeline = get_pipeline(vectorizer, model)
    texts = (doc.text for doc in streamer)

    # Fetch the index before predicting, in the same order as a single
    # zip(streamer.index, pipeline.predict_proba(texts)) call would.
    doc_indices = streamer.index
    probabilities = pipeline.predict_proba(texts)

    for pair in zip(doc_indices, probabilities):
        yield pair
def run_pipeline(df, pipeline, pipeline_name=''):
    """Cross-validate ``pipeline`` on ``df`` and print per-fold metrics.

    The ``text`` column of ``df`` is used as input and the label-encoded
    ``author`` column as target. For each of five stratified folds the
    pipeline is refit, its ``predict_proba`` output is scored with log loss,
    and the log loss and the result of ``calculate_accuracy`` are printed.
    The mean log loss over all folds is printed at the end.

    Args:
        df: DataFrame providing ``text`` and ``author`` columns.
        pipeline: Estimator exposing ``fit`` and ``predict_proba``.
        pipeline_name: Label used in the final summary line.

    Returns:
        None. All results are printed.
    """
    # Imported locally because `pd.np` is no longer available in pandas 2.x.
    import numpy as np

    X = pd.Series(df["text"])
    y = preprocessing.LabelEncoder().fit_transform(df.author.values)

    # `random_state` is omitted: without shuffle=True it never influenced the
    # splits, and current scikit-learn raises ValueError for the combination.
    folds = StratifiedKFold(n_splits=5)

    losses = []
    for train_index, test_index in folds.split(X, y):
        # split() yields positions, so select by position; plain X[...] would
        # do a label lookup and break on a non-default index.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        pipeline.fit(X_train, y_train)
        predictions = pipeline.predict_proba(X_test)

        fold_loss = metrics.log_loss(y_test, predictions)
        losses.append(fold_loss)
        print(" Log loss: " + str(fold_loss))
        print(" Accuracy : %0.3f " % calculate_accuracy(y_test, predictions))

    print(f'{pipeline_name} mean log loss: {round(np.mean(losses), 3)}')
def estimate_probability(vectorizer, model, streamer):
    """
    Generate probabilities for a one-label estimator

    Arguments:
        * vectorizer: a sklearn Vectorizer (or pipeline)
        * model: a quantgov.estimator.Estimator
        * streamer: a quantgov.corpora.CorpusStreamer

    Yields:
        2-tuples of docindex and the probability taken from the column
        whose class equals 1
    """
    pipeline = get_pipeline(vectorizer, model)
    texts = (doc.text for doc in streamer)

    # Column position of the class equal to 1.
    class_values = [int(cls) for cls in model.model.classes_]
    truecol = class_values.index(1)

    # Predict first, then read the index, then pair them up row by row.
    probabilities = pipeline.predict_proba(texts)
    for docidx, row in zip(streamer.index, probabilities):
        yield docidx, row[truecol]
def estimate_probability_multilabel_multiclass(vectorizer, model, streamer):
    """
    Generate probabilities for a multilabel, multiclass estimator

    Arguments:
        * vectorizer: a sklearn Vectorizer (or pipeline)
        * model: a quantgov.estimator.Estimator
        * streamer: a quantgov.corpora.CorpusStreamer

    Yields:
        2-tuples of docindex and a tuple holding, for each label, that
        label's ``predict_proba`` entry for the document
    """
    pipeline = get_pipeline(vectorizer, model)
    texts = (doc.text for doc in streamer)
    per_label_predictions = pipeline.predict_proba(texts)

    for row, docidx in enumerate(streamer.index):
        # Take this document's row out of every label's prediction block.
        document_probabilities = tuple(
            label_block[row] for label_block in per_label_predictions
        )
        yield docidx, document_probabilities
def score_solution():
    """Train the solution's pipeline and score it with ROC AUC.

    Returns:
        The result of ``sklearn.metrics.roc_auc_score`` computed from the
        test labels and the predicted scores.

    Raises:
        AssertionError: If ``solution.get_pipeline`` does not return an
            ``sklearn.pipeline.Pipeline``, or if the ``predict_proba``
            output is neither 1-D nor 2-D with exactly two columns.
    """
    # Ask the solution for the model pipeline.
    import solution

    pipeline = solution.get_pipeline()
    # Raised explicitly (rather than via `assert`) so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if not isinstance(pipeline, sklearn.pipeline.Pipeline):
        raise AssertionError(
            'Your `solution.get_pipeline` implementation should '
            'return an `sklearn.pipeline.Pipeline`.')

    # Train the model on the training DataFrame.
    X_train, y_train = get_data(subset='train')
    pipeline.fit(X_train, y_train)

    # Apply the model to the test DataFrame.
    X_test, y_test = get_data(subset='test')
    y_pred = pipeline.predict_proba(X_test)

    # Check that the predicted probabilities have an sklearn-compatible shape.
    shape_ok = (y_pred.ndim == 1) or \
        (y_pred.ndim == 2 and y_pred.shape[1] == 2)
    if not shape_ok:
        raise AssertionError(
            "The predicted probabilities should match sklearn's "
            "`predict_proba` output shape.")
    # For a two-column result keep only the second column.
    y_pred = y_pred if y_pred.ndim == 1 else y_pred[:, 1]

    # Evaluate the predictions with the AUC of the ROC curve.
    return sklearn.metrics.roc_auc_score(y_test, y_pred)
# Compare the predictions of an exported ONNX model with those of the
# scikit-learn pipeline on the first row of the stored training data.
import onnxruntime as rt
import joblib
from numpy import load
import sklearn.pipeline

# Open an inference session on the exported ONNX model.
sess = rt.InferenceSession("output/model.onnx")

# allow_pickle=True lets numpy unpickle object arrays; only use it on
# trusted files.
train_data = load("train_data.npy", allow_pickle=True)
print('---', train_data[0])

# Feed a single-row batch under the input name 'input'.
inputs = {'input': train_data[:1]}
pred_onx = sess.run(None, inputs)

# The first result is printed as the prediction, the second as the
# probabilities.
print("onnx predict_proba")
print("predict", pred_onx[0])
print("predict_proba", pred_onx[1])

# NOTE(review): `pipeline` is not defined anywhere in the visible code
# (joblib is imported but never used to load it) -- confirm where it is
# supposed to come from; as shown, these lines raise NameError.
print("skl predict_proba")
print("predict", pipeline.predict(train_data[:1]))
print("predict_proba", pipeline.predict_proba(train_data[:1]))
clf__min_samples_leaf=range(2, 4, 1), clf__min_weight_fraction_leaf=[0], ) #grid_search = sklearn.grid_search.GridSearchCV( # pipeline, n_jobs=1, param_grid=param_grid, verbose=100, # scoring=youdenJ,score_func=youdenJ, # cv=sklearn.cross_validation.PredefinedSplit(testidx)) #grid_search.fit(trainFact[:,rfecv.support_], labels) #results1=([sklearn.metrics.confusion_matrix(labels,grid_search.best_estimator_.predict(train))]) #grid_search_results1=(grid_search.grid_scores_) #kwargs=grid_search.best_params_ #pipeline.set_params(**kwargs) pipeline.fit(train[train.columns[rfecv.support_]], labels) predictions = (pipeline.predict_proba( test[train.columns[rfecv.support_]])[:, 1] >= 0.02) * 1 predictionstrain = (pipeline.predict_proba( train[train.columns[rfecv.support_]])[:, 1] >= 0.02) * 1 print Youdens_func(labels, predictionstrain) # create predictions and submission file sample['WnvPresent'] = predictions sample.to_csv('testpredicts5.csv', index=False) print sum(predictions) #%% ##########################ROC Plots ########################################### for yr in [2007, 2009, 2011, 2013]: pipeline.fit(train[train.year != yr][train.columns[rfecv.support_]],
# Report timing, the evaluation report and the names of the selected features.
print("Classifying unlabeled data done in: %fs" % (time()-t0))
print(report)
kfeatures = np.asarray(selector.get_support(indices=True))
print(np.asarray(vectorizer.get_feature_names())[kfeatures])

#################################################################
###### 3. Use classifier on unlabelled data

pred_unlab = pipeline.predict(X_matrix_unlab).tolist()

# Make sure the output directory exists before appending to files in it.
directory = 'results'
if not os.path.exists(directory):
    os.makedirs(directory)

probs = np.asmatrix(pipeline.predict_proba(X_matrix_unlab))
for i, title in enumerate(unlabeled_titles):
    # Row i of the matrix converts to a nested one-row list, hence the
    # double max() to reach the largest probability.
    m = max(max(probs[i, :].tolist()))
    # Titles whose best probability is below .5 go to 'unsorted'; the rest
    # go to a file named after the predicted label.
    if m < .5:
        filename = directory + '/unsorted'
    else:
        filename = directory + '/{}'.format(pred_unlab[i])
    # The context manager closes the file even if the write fails.
    with open(filename, 'a') as output_file:
        output_file.write("%s\n" % title)
else: X_resampled = X_train y_resampled = y_train t0 = time.clock() pipeline.fit(X_resampled, y_resampled) time_to_fit = (time.clock() - t0) print("done fitting in {}".format(time_to_fit)) ''' Predictions ''' predicted = pipeline.predict(X_test) try: predicted_prob = pipeline.predict_proba(X_test) predicted_prob = predicted_prob[:, 1] # probability that label is 1 except: print("Model has no predict_proba method") ''' Evaluation Statistics ''' print() print("Evaluation Statistics") if model_name=='KNN': print("Getting feature support") features = pipeline.named_steps['feat'] print(X_train.columns[features.transform(np.arange( len(X_train.columns)))])
clf__min_weight_fraction_leaf=[0], ) #grid_search = sklearn.grid_search.GridSearchCV( # pipeline, n_jobs=1, param_grid=param_grid, verbose=100, # scoring=youdenJ,score_func=youdenJ, # cv=sklearn.cross_validation.PredefinedSplit(testidx)) #grid_search.fit(trainFact[:,rfecv.support_], labels) #results1=([sklearn.metrics.confusion_matrix(labels,grid_search.best_estimator_.predict(train))]) #grid_search_results1=(grid_search.grid_scores_) #kwargs=grid_search.best_params_ #pipeline.set_params(**kwargs) pipeline.fit(train[train.columns[rfecv.support_]],labels) predictions=(pipeline.predict_proba(test[train.columns[rfecv.support_]])[:,1]>=0.02)*1 predictionstrain=(pipeline.predict_proba(train[train.columns[rfecv.support_]])[:,1]>=0.02)*1 print Youdens_func(labels,predictionstrain) # create predictions and submission file sample['WnvPresent'] = predictions sample.to_csv('testpredicts5.csv', index=False) print sum(predictions) #%%