def train(path, prefix="", out_dir=""):
    """
    Fit a random forest on the exported feature matrix and store it as JSON.
    ----
    path: str
        Forwarded to ``export_matrix`` as ``fasta_path``.
    prefix:
        Forwarded to ``export_matrix`` as ``name`` and used in the model
        file name ``psap_model_{prefix}.json``.
    out_dir:
        Forwarded to ``export_matrix`` as ``out_path``; also the folder
        (created if missing) that receives the serialized model.
    """
    print("annotating fasta")
    matrix = export_matrix(name=prefix, fasta_path=path, out_path=out_dir)
    scaled = preprocess_and_scaledata(matrix, "llps")

    # Only numeric columns are used; "llps" is the class column.
    numeric = scaled.select_dtypes([np.number])
    features = numeric.drop("llps", axis=1)
    target = numeric["llps"]

    forest_settings = {
        "n_jobs": 32,
        "class_weight": "balanced",
        "n_estimators": 1200,
        "criterion": "entropy",
        "random_state": 42,
    }
    clf = RandomForestClassifier(**forest_settings)
    clf.fit(features, target)

    # write model to json
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    model_file = out_dir / f"psap_model_{prefix}.json"
    skljson.to_json(clf, model_file)
def test_sklearn_deploy(self, mock_post, mock_put):
    """Deploying a small model issues exactly one POST and one PUT."""
    model_name = 'test-model'

    # Canned HTTP replies for the two patched request functions.
    post_reply = Mock()
    post_reply.json.return_value = {'url': 'http://test-url', 'tier': 0}
    post_reply.status_code = 200
    mock_post.return_value = post_reply

    put_reply = Mock()
    put_reply.json.return_value = {}
    put_reply.status_code = 200
    mock_put.return_value = put_reply

    # Reuse the serialized model when present; otherwise train and store it.
    if not path.exists(model_name):
        X, y = make_classification(n_samples=50, n_features=3, n_classes=3,
                                   n_informative=3, n_redundant=0,
                                   random_state=0, shuffle=False)
        clf = RandomForestClassifier()
        clf.fit(X, y)
        skljson.to_json(clf, model_name)
    else:
        clf = skljson.from_json(model_name)

    client = SKLearn('test')
    client.deploy(clf, model_name)

    mock_post.assert_called_once()
    mock_put.assert_called_once()
def train(self, mType, outName=None, useJSON=False) -> bool:
    """Fit a classifier on ``self.trainDict`` and write it to disk.

    ``self.trainDict`` is iterated as ``{target: iterable_of_samples}``;
    every sample is flattened to one row and labelled with its key.

    :param mType: model type, case-insensitive: ``"svm"`` (``SVC`` with
        ``gamma=0.001``) or ``"gnb"`` (``GaussianNB``). Stored unchanged in
        ``self.modelType``.
    :param outName: output path without extension. When ``None`` it
        defaults to ``"models/" + self.game + mType.lower()``.
    :param useJSON: when true, write ``<outName>.json`` through
        ``skljson.to_json``; otherwise pickle to ``<outName>.pickle``.
    :returns: ``True`` once the model has been written; ``False`` when the
        model type is unsupported or there are no training samples.
    """
    self.modelType = mType
    kind = mType.lower()

    # Constructors are deferred so only the requested estimator is built.
    factories = {
        "svm": lambda: SVC(gamma=0.001),
        "gnb": lambda: GaussianNB(),
    }
    if kind not in factories:
        print("Supported types: GNB, SVM.")
        return False

    pairs = [
        (target, img)
        for target, imgs in self.trainDict.items()
        for img in imgs
    ]
    if not pairs:
        # reshape((0, -1)) cannot infer the second dimension and would raise.
        print("No training samples available.")
        return False

    trainTarget = [target for target, _ in pairs]
    trainPred = np.array([img for _, img in pairs]).reshape((len(pairs), -1))

    model = factories[kind]()
    model.fit(X=trainPred, y=trainTarget)

    if outName is None:
        outName = "models/" + self.game + kind

    if useJSON:
        outName += ".json"
        skljson.to_json(model, outName)
    else:
        outName += ".pickle"
        with open(outName, 'wb') as f:
            pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)
    return True
def train(
    path,
    prefix="",
    labels=None,
    out_dir="",
):
    """
    Annotate the exported matrix, fit a random forest and store it as JSON.
    ----
    path: str
        Forwarded to ``export_matrix`` as ``fasta_path``.
    prefix:
        Forwarded to ``export_matrix`` and used in the model file name
        ``psap_model_{prefix}.json``.
    labels:
        Forwarded to ``annotate``.
    out_dir:
        Forwarded to ``export_matrix`` as ``out_path``; also the folder
        (created if missing) that receives the serialized model.
    """
    matrix = export_matrix(prefix=prefix, fasta_path=path, out_path=out_dir)
    annotated = annotate(matrix.df, labels=labels)
    class_column = annotated["llps"]

    scaled = preprocess_and_scaledata(annotated)
    # re-add class column after scaling
    scaled["llps"] = class_column

    numeric = scaled.select_dtypes([np.number])
    X = numeric.drop("llps", axis=1)
    y = numeric["llps"]

    # train random forest classifier
    n_instances, n_features = len(X.index), len(X.columns)
    logger.debug(
        "Training RF with {nin} instances and {nf} features",
        nf=n_features,
        nin=n_instances,
    )
    forest_settings = {
        "n_jobs": 32,
        "class_weight": "balanced",
        "n_estimators": 1200,
        "criterion": "entropy",
        "random_state": 42,
    }
    clf = RandomForestClassifier(**forest_settings)
    clf.fit(X, y)

    # write model to json
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_file = out_dir / f"psap_model_{prefix}.json"
    logger.info("Writing trained RF classifier to {json}", json=out_file)
    skljson.to_json(clf, out_file)
def test_sklearn_model_exceeded(self, mock_post):
    """Deploying the larger model raises ``mlrequest.ModelSizeExceeded``."""
    model_name = 'test-model-1mb'

    # Canned HTTP reply for the patched POST.
    post_reply = Mock()
    post_reply.json.return_value = {'url': 'http://test-url', 'tier': 0}
    post_reply.status_code = 200
    mock_post.return_value = post_reply

    # Reuse the serialized model when present; otherwise train and store it.
    if not path.exists(model_name):
        X, y = make_classification(n_samples=15000, n_features=10,
                                   n_classes=3, n_informative=3,
                                   n_redundant=0, random_state=0,
                                   shuffle=False)
        clf = RandomForestClassifier()
        clf.fit(X, y)
        skljson.to_json(clf, model_name)
    else:
        clf = skljson.from_json(model_name)

    client = SKLearn('test')
    with self.assertRaises(mlrequest.ModelSizeExceeded) as exception:
        client.deploy(clf, model_name)

    mock_post.assert_called_once()
def check_sparse_model_json(self, model, model_name, abs=False):
    """Fit on the sparse fixture, round-trip through JSON, compare predictions.

    The ``abs`` flag (which shadows the builtin inside this method) makes
    the model train on ``np.absolute(self.X_sparse)`` instead of the raw
    sparse fixture. Predictions are compared on ``self.X``.
    """
    # Given
    fit_input = np.absolute(self.X_sparse) if abs else self.X_sparse
    model.fit(fit_input, self.y_sparse)

    # When
    skljson.to_json(model, model_name)
    restored = skljson.from_json(model_name)

    # Then
    testing.assert_array_equal(model.predict(self.X),
                               restored.predict(self.X))
# Train a random forest on the wine dataset and round-trip it through JSON.
import sklearn_json as skljson
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

## load data
wine = load_wine()

## split train/test (30% held out)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    wine.data,
    wine.target,
    test_size=0.3,
)

## fit
model = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=0)
model = model.fit(Xtrain, Ytrain)

## to json
skljson.to_json(model, "rf-model")

## from_json, then score the restored model on the held-out split
model2 = skljson.from_json("rf-model")
score = model2.score(Xtest, Ytest)
print(score)
def serialize_model(model, path):
    """Write ``model`` to ``path`` by delegating to ``skljson.to_json``."""
    skljson.to_json(model, path)
def _binary_scores(y_true, y_pred):
    """Return (accuracy, sensitivity, specificity, mcc) from a 2x2 confusion matrix."""
    # Cast to float64: the MCC denominator multiplies four counts and can
    # silently wrap around in int64 for large datasets. NumPy floats keep
    # the nan/inf-on-zero-division behaviour of the integer version.
    (TN, FP), (FN, TP) = confusion_matrix(y_true, y_pred).astype(np.float64)
    acc = (TN + TP) / (TN + TP + FP + FN)
    sens = TP / (TP + FN)
    spec = TN / (TN + FP)
    mcc = (TP * TN - FP * FN) / np.sqrt(
        (TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
    return acc, sens, spec, mcc


def gridsearch(X_train,
               Y_train,
               X_val,
               Y_val,
               model,
               params,
               modelpath=None,
               resultspath=None,
               random_state=1618,
               n_jobs=1):
    """
    Perform Gridsearch on candidate parameters and evaluate results on
    provided training and validation sets

    :param X_train: training dataframe
    :param Y_train: training labels
    :param X_val: validation dataframe
    :param Y_val: validation labels
    :param model: model name (case-insensitive): one of svc, lda, qda,
        logisticregression, randomforest, gradientboosting, adaboost, knn
        or xgboost
    :param params: parameters dictionary, expanded with ``param_list``;
        the caller's dictionary is left untouched
    :param modelpath: if not None, path where the best model is saved
        (``save_model`` for xgboost, ``sklearn_json.to_json`` otherwise)
    :param resultspath: if given, path where the results table is written
        as tab-separated text
    :param random_state: random state for stochastic models
    :param n_jobs: passed as ``n_jobs`` to logisticregression, knn and
        randomforest, and as ``nthread`` to xgboost
    :returns: tuple of the results DataFrame (sorted by validation MCC,
        best first) and the best model
    """
    results = []
    # Start below any real score so the first candidate is always kept,
    # even when every validation MCC is <= 0 or NaN.
    best_model = None
    best_mcc = -np.inf
    model = model.lower()
    model_dict = {
        "svc": SVC,
        "lda": LinearDiscriminantAnalysis,
        "qda": QuadraticDiscriminantAnalysis,
        "logisticregression": LogisticRegression,
        "randomforest": RandomForestClassifier,
        "gradientboosting": GradientBoostingClassifier,
        "adaboost": AdaBoostClassifier,
        "knn": KNeighborsClassifier
    }
    stochastic = [
        "svc", "logisticregression", "gradientboosting", "adaboost",
        "randomforest", "xgboost"
    ]
    isstochastic = (model in stochastic)

    if model == 'xgboost':
        train_dmat = xgb.DMatrix(X_train, Y_train)
        val_dmat = xgb.DMatrix(X_val, Y_val)
        # class imbalance
        ci = np.sum(Y_train == 0) / np.sum(Y_train == 1)
        # Work on a copy so the caller's grid is not modified.
        params = dict(params)
        params['scale_pos_weight'] = [ci, np.sqrt(ci)]

    params = param_list(params)

    for p in params:
        if isstochastic:
            if model == 'xgboost':
                p['seed'] = random_state
            else:
                p['random_state'] = random_state
        if model in ['logisticregression', 'knn', 'randomforest']:
            p['n_jobs'] = n_jobs

        # FIT
        if model == 'xgboost':
            p['nthread'] = n_jobs
            p['objective'] = 'binary:logistic'
            temp_model = xgb.train(p,
                                   train_dmat,
                                   num_boost_round=100,
                                   early_stopping_rounds=15,
                                   evals=[(train_dmat, 'train'),
                                          (val_dmat, 'validation')],
                                   verbose_eval=0)
            # EVALUATE: threshold the predicted probabilities at 0.5
            Y_hat_train = (temp_model.predict(train_dmat) > 0.5) * 1
            Y_hat_val = (temp_model.predict(val_dmat) > 0.5) * 1
        else:
            temp_model = deepcopy(model_dict[model]())
            temp_model.set_params(**p)
            temp_model.fit(X_train, Y_train)
            # EVALUATE
            Y_hat_train = temp_model.predict(X_train)
            Y_hat_val = temp_model.predict(X_val)

        t_acc, t_sens, t_spec, t_mcc = _binary_scores(Y_train, Y_hat_train)
        v_acc, v_sens, v_spec, v_mcc = _binary_scores(Y_val, Y_hat_val)

        # NaN never compares greater, so rank it as the worst possible score.
        rank_mcc = -np.inf if np.isnan(v_mcc) else v_mcc
        if best_model is None or rank_mcc > best_mcc:
            best_model = deepcopy(temp_model)
            best_mcc = rank_mcc

        results.append(
            [p, t_acc, t_sens, t_spec, t_mcc, v_acc, v_sens, v_spec, v_mcc])

    results = pd.DataFrame(results,
                           columns=[
                               'params', 'train_accuracy',
                               'train_sensitivity', 'train_specificity',
                               'train_mcc', 'validation_accuracy',
                               'validation_sensitivity',
                               'validation_specificity', 'validation_mcc'
                           ])
    results = results.sort_values('validation_mcc', ascending=False)

    # Report the metrics of the top-ranked row, not those of the last
    # candidate evaluated in the loop.
    t_acc, t_sens, t_spec, t_mcc, v_acc, v_sens, v_spec, v_mcc = results.iloc[
        0, 1:]
    print(f'Best model params: {results.iloc[0, 0]}')
    print(
        'Train: Accuracy: {:3.3f}, Sensitivity: {:3.3f}, specificity: {:3.3f}, mcc: {:.3f}'
        .format(t_acc, t_sens, t_spec, t_mcc))
    print(
        'Validation: Accuracy: {:3.3f}, Sensitivity: {:3.3f}, specificity: {:3.3f}, mcc: {:.3f}'
        .format(v_acc, v_sens, v_spec, v_mcc))
    print(best_model)

    if modelpath is not None:
        if model == 'xgboost':
            best_model.save_model(modelpath)
        else:
            sklearn_json.to_json(best_model, modelpath)
    if resultspath:
        results.to_csv(resultspath, sep='\t')

    return results, best_model
def export_model_to_file(self):
    """Serialize ``self.model_obj`` to ``self.get_path()`` with sklearn_json."""
    # Imported at call time, so sklearn_json is only needed when exporting.
    import sklearn_json as skljson

    model, destination = self.model_obj, self.get_path()
    skljson.to_json(model, destination)
# Train a logistic-regression model on the fruit dataset, then persist the
# model (as JSON) and the fitted scaler (as a pickle).
import pickle

import pandas as pd
# pip install sklearn-json
import sklearn_json as skljson
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Signed download URL for the tab-separated dataset.
data_url = 'https://storage.googleapis.com/kagglesdsdata/datasets/9590/13660/fruit_data_with_colors.txt?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20210201%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20210201T171906Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=30af8a4f1bb07bf9c4490a58dcfcf3580e901a7f19bf63ec258ee400c0d2dfea965e0c0c7e6db0ab7b57481a616fc2bc508f776c470bcc745f90df6bb0252c9482d23b30ffc99de8758d4e48fe6d38f1a044aa6b1cf2770a0799a7c7a7dfc58ebb526c60fdcb3e181e301ef5360433a8317cdaf752415863c73b9c10270dfd4bfaaf5a60c099cb13b5afd0c85c6518776bef2fbbb4115bd2c023c4db3ac1e14fe7549e5de4244bf48767830ef9fbc411e4ca97f93821027598226fc5725217cc24ed066281395826a740ec8e67beca8644aa35c523289b043597da5ebdd0122ff26226cdee3dc1d173c51c632e2dc2a88ebc032690182ceb68f3e1ecaff5904c'
df = pd.read_csv(data_url, sep='\t')

# Inputs (columns 3..6) and output (column 1)
x = df.iloc[:, 3:7].values
y = df.iloc[:, 1].values

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

# The scaler is fitted on the training split only and reused for the test split.
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

model = LogisticRegression()
model.fit(x_train, y_train)

model_name = "lr_model.json"
scaler_name = "scaler.json"

skljson.to_json(model, model_name)

# NOTE: the scaler is written with pickle (binary) despite the .json name.
with open(scaler_name, 'wb') as f:
    pickle.dump(scaler, f)
# pip install sklearn-json
import sklearn_json as skljson

# Round-trip: write `model` to JSON, then load it back from the same file.
file_name = "abc.json"
skljson.to_json(model, file_name)
deserialized_model = skljson.from_json(file_name)
# train with decision tree
# Options left disabled: max_depth=5, min_samples_leaf=5
clf = tree.DecisionTreeClassifier(criterion='gini', random_state=None)
clf = clf.fit(Xtrain, Ytrain)

train_accuracy = clf.score(Xtrain, Ytrain)
print("train accu: " + str(train_accuracy))
test_accuracy = clf.score(Xtest, Ytest)
print("tmp3.json accu: " + str(test_accuracy))

# show: render the fitted tree with graphviz and open the viewer
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=wine.feature_names,
    class_names=wine.target_names,
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_data)
graph.view("tree")

# model to json
skljson.to_json(clf, "tree_model")

# ===============================================
# ===============================================
# ===============================================