Пример #1
0
def train(path, prefix="", out_dir=""):
    """
    Fit a random forest on the annotated matrix and store it as JSON.

    ----
    path: str
        Forwarded to ``export_matrix`` as ``fasta_path``
        (presumably a FASTA file -- TODO confirm against export_matrix).
    prefix: str
        Forwarded to ``export_matrix`` as ``name`` and embedded in the
        output file name ``psap_model_{prefix}.json``.
    out_dir: str
        Folder for the serialized model; created (with parents) when missing.
        Also forwarded to ``export_matrix`` as ``out_path``.
    """
    print("annotating fasta")
    annotated = export_matrix(name=prefix, fasta_path=path, out_path=out_dir)
    scaled = preprocess_and_scaledata(annotated, "llps")

    # Only numeric columns take part in training; "llps" is the class label.
    numeric = scaled.select_dtypes([np.number])
    y = numeric["llps"]
    X = numeric.drop(columns="llps")

    forest_settings = {
        "n_jobs": 32,
        "class_weight": "balanced",
        "n_estimators": 1200,
        "criterion": "entropy",
        "random_state": 42,
    }
    clf = RandomForestClassifier(**forest_settings)
    clf.fit(X, y)

    # Persist the fitted classifier as JSON inside the output folder.
    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    skljson.to_json(clf, target_dir / f"psap_model_{prefix}.json")
Пример #2
0
    def test_sklearn_deploy(self, mock_post, mock_put):
        # Deploying a small model should issue exactly one POST and one PUT.
        # mock_post / mock_put are presumably injected by patch decorators
        # that sit outside this excerpt -- TODO confirm.
        model_name = 'test-model'

        # Canned POST reply.
        post_reply = Mock()
        post_reply.status_code = 200
        post_reply.json.return_value = {'url': 'http://test-url', 'tier': 0}
        mock_post.return_value = post_reply

        # Canned PUT reply with an empty JSON payload.
        put_reply = Mock()
        put_reply.status_code = 200
        put_reply.json.return_value = {}
        mock_put.return_value = put_reply

        # Reuse the model file from an earlier run when present; otherwise
        # train a tiny forest and write it to disk.
        if not path.exists(model_name):
            X, y = make_classification(n_samples=50, n_features=3, n_classes=3, n_informative=3,
                                       n_redundant=0, random_state=0, shuffle=False)
            clf = RandomForestClassifier()
            clf.fit(X, y)
            skljson.to_json(clf, model_name)
        else:
            clf = skljson.from_json(model_name)

        client = SKLearn('test')
        client.deploy(clf, model_name)

        mock_post.assert_called_once()
        mock_put.assert_called_once()
Пример #3
0
 def train(self, mType, outName=None, useJSON=False) -> bool:
     """Fit a classifier on ``self.trainDict`` and write it to disk.

     ``self.trainDict`` is iterated as ``{target: iterable_of_samples}``;
     every sample is flattened to one row and labelled with its key.

     :param mType: model type, case-insensitive; "svm" or "gnb".
     :param outName: output path without extension. When ``None`` it
         defaults to ``"models/" + self.game + mType.lower()``.
     :param useJSON: when true, save through ``skljson.to_json`` with a
         ".json" suffix; otherwise pickle with a ".pickle" suffix.
     :returns: ``True`` once the model has been saved, ``False`` when
         ``mType`` is not supported (a hint is printed).
     """
     self.modelType = mType
     kind = mType.lower()

     # Reject unsupported types up front so no data work (and no reshape
     # error on an empty training set) happens before returning False.
     if kind == "svm":
         model = SVC(gamma=0.001)
     elif kind == "gnb":
         model = GaussianNB()
     else:
         print("Supported types: GNB, SVM.")
         return False

     # Flatten {target: [samples...]} into parallel label / sample lists.
     trainTarget = []
     trainPred = []
     for target, pred in self.trainDict.items():
         for img in pred:
             trainTarget.append(target)
             trainPred.append(img)

     # One row per sample, all remaining dimensions collapsed into features.
     trainPred = np.array(trainPred).reshape((len(trainPred), -1))
     model.fit(X=trainPred, y=trainTarget)

     if outName is None:
         outName = "models/" + self.game + kind
     if useJSON:
         outName += ".json"
         skljson.to_json(model, outName)
     else:
         outName += ".pickle"
         with open(outName, 'wb') as f:
             pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)
     return True
Пример #4
0
def train(
    path,
    prefix="",
    labels=None,
    out_dir="",
):
    """
    Annotate, scale and fit a random forest, then store the model as JSON.

    ----
    path: str
        Forwarded to ``export_matrix`` as ``fasta_path``
        (presumably a FASTA file -- TODO confirm against export_matrix).
    prefix:
        Forwarded to ``export_matrix`` and embedded in the output file name
        ``psap_model_{prefix}.json``.
    labels:
        Forwarded unchanged to ``annotate``.
    out_dir:
        Folder for the serialized model; created (with parents) when missing.
        Also forwarded to ``export_matrix`` as ``out_path``.
    """
    matrix = export_matrix(prefix=prefix, fasta_path=path, out_path=out_dir)
    annotated = annotate(matrix.df, labels=labels)

    # Keep the class column aside and put it back after scaling.
    class_column = annotated["llps"]
    scaled = preprocess_and_scaledata(annotated)
    scaled["llps"] = class_column

    # Only numeric columns take part in training; "llps" is the class label.
    numeric = scaled.select_dtypes([np.number])
    y = numeric["llps"]
    X = numeric.drop(columns="llps")

    logger.debug(
        "Training RF with {nin} instances and {nf} features",
        nf=len(X.columns),
        nin=len(X.index),
    )
    forest_settings = {
        "n_jobs": 32,
        "class_weight": "balanced",
        "n_estimators": 1200,
        "criterion": "entropy",
        "random_state": 42,
    }
    clf = RandomForestClassifier(**forest_settings)
    clf.fit(X, y)

    # Persist the fitted classifier as JSON inside the output folder.
    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    out_file = target_dir / f"psap_model_{prefix}.json"
    logger.info("Writing trained RF classifier to {json}", json=out_file)
    skljson.to_json(clf, out_file)
Пример #5
0
    def test_sklearn_model_exceeded(self, mock_post):
        # Deploying the larger model is expected to raise
        # mlrequest.ModelSizeExceeded after exactly one POST.
        # mock_post is presumably injected by a patch decorator that sits
        # outside this excerpt -- TODO confirm.
        model_name = 'test-model-1mb'

        # Canned POST reply.
        post_reply = Mock()
        post_reply.status_code = 200
        post_reply.json.return_value = {'url': 'http://test-url', 'tier': 0}
        mock_post.return_value = post_reply

        # Reuse the model file from an earlier run when present; otherwise
        # train a forest on 15000 samples and write it to disk.
        if not path.exists(model_name):
            X, y = make_classification(n_samples=15000, n_features=10, n_classes=3, n_informative=3,
                                       n_redundant=0, random_state=0, shuffle=False)
            clf = RandomForestClassifier()
            clf.fit(X, y)
            skljson.to_json(clf, model_name)
        else:
            clf = skljson.from_json(model_name)

        client = SKLearn('test')
        with self.assertRaises(mlrequest.ModelSizeExceeded):
            client.deploy(clf, model_name)
        mock_post.assert_called_once()
Пример #6
0
    def check_sparse_model_json(self, model, model_name, abs=False):
        # Fit `model` on the sparse fixture, round-trip it through JSON under
        # `model_name`, and require identical predictions on self.X.
        # With abs=True the features go through np.absolute before fitting.
        train_features = np.absolute(self.X_sparse) if abs else self.X_sparse
        model.fit(train_features, self.y_sparse)

        # Serialize, then load the model back from the same name.
        skljson.to_json(model, model_name)
        restored = skljson.from_json(model_name)

        # Original and restored model must agree on every prediction.
        testing.assert_array_equal(model.predict(self.X), restored.predict(self.X))
Пример #7
0
# sklearn rf-model, serialize as json

import sklearn_json as skljson
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

## load the wine dataset
wine = load_wine()

## split into train / test sets (30% held out)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    wine.data, wine.target, test_size=0.3
)

## fit a small random forest
model = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=0)
model.fit(Xtrain, Ytrain)

## serialize to json
skljson.to_json(model, "rf-model")

## load it back and score the restored model on the held-out split
model2 = skljson.from_json("rf-model")
score = model2.score(Xtest, Ytest)
print(score)
Пример #8
0
 def serialize_model(model, path):
     """Write ``model`` to ``path`` by delegating to ``skljson.to_json``."""
     skljson.to_json(model, path)
Пример #9
0
def _binary_metrics(y_true, y_pred):
    """Return (accuracy, sensitivity, specificity, mcc) from a 2x2 confusion matrix."""
    (TN, FP), (FN, TP) = confusion_matrix(y_true, y_pred)
    acc = (TN + TP) / (TN + TP + FP + FN)
    sens = TP / (TP + FN)
    spec = TN / (TN + FP)
    mcc = (TP * TN - FP * FN) / np.sqrt(
        (TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
    return acc, sens, spec, mcc


def gridsearch(X_train,
               Y_train,
               X_val,
               Y_val,
               model,
               params,
               modelpath=None,
               resultspath=None,
               random_state=1618,
               n_jobs=1):
    """
    Perform Gridsearch on candidate parameters and evaluate results on
    provided training and validation sets

    Every candidate is fitted on the training set and scored (accuracy,
    sensitivity, specificity, MCC) on both sets; the candidate with the
    highest validation MCC is kept as the best model.

    :param X_train: training dataframe
    :param Y_train: training labels
    :param X_val: validation dataframe
    :param Y_val: validation labels
    :param model: model name (case-insensitive): one of "svc", "lda", "qda",
        "logisticregression", "randomforest", "gradientboosting", "adaboost",
        "knn" or "xgboost"
    :param params: parameters dictionary, expanded by ``param_list``
        (presumably into one dict per candidate -- confirm against that
        helper). The caller's dictionary is not modified.
    :param modelpath: where the best model is saved (``save_model`` for
        xgboost, ``sklearn_json.to_json`` otherwise); skipped when None
    :param resultspath: where the results table is written as
        tab-separated text; skipped when empty or None
    :param random_state: random state for stochastic models
    :param n_jobs: number of parallel jobs / threads for models that
        support it
    :returns: tuple of results dataframe (sorted by validation MCC,
        best first) and the best model
    :raises KeyError: if ``model`` is neither "xgboost" nor one of the
        supported sklearn names (raised when the first candidate is built)
    """

    results = []
    # Start without a winner so the first candidate is always kept; the
    # previous threshold of 0 left best_model unbound whenever no candidate
    # reached a positive validation MCC.
    best_model = None
    best_mcc = -np.inf

    model = model.lower()

    model_dict = {
        "svc": SVC,
        "lda": LinearDiscriminantAnalysis,
        "qda": QuadraticDiscriminantAnalysis,
        "logisticregression": LogisticRegression,
        "randomforest": RandomForestClassifier,
        "gradientboosting": GradientBoostingClassifier,
        "adaboost": AdaBoostClassifier,
        "knn": KNeighborsClassifier
    }

    stochastic = [
        "svc", "logisticregression", "gradientboosting", "adaboost",
        "randomforest", "xgboost"
    ]

    isstochastic = (model in stochastic)
    is_xgboost = (model == 'xgboost')

    if is_xgboost:
        train_dmat = xgb.DMatrix(X_train, Y_train)
        val_dmat = xgb.DMatrix(X_val, Y_val)

        # class imbalance
        ci = np.sum(Y_train == 0) / np.sum(Y_train == 1)
        # Work on a shallow copy so the caller's dictionary stays untouched.
        params = dict(params)
        params['scale_pos_weight'] = [ci, np.sqrt(ci)]

    params = param_list(params)

    for p in params:
        if isstochastic:
            if is_xgboost:
                p['seed'] = random_state
            else:
                p['random_state'] = random_state

        if model in ['logisticregression', 'knn', 'randomforest']:
            p['n_jobs'] = n_jobs

        # FIT
        if is_xgboost:
            p['nthread'] = n_jobs
            p['objective'] = 'binary:logistic'
            temp_model = xgb.train(p,
                                   train_dmat,
                                   num_boost_round=100,
                                   early_stopping_rounds=15,
                                   evals=[(train_dmat, 'train'),
                                          (val_dmat, 'validation')],
                                   verbose_eval=0)

            # EVALUATE: threshold the predicted probabilities at 0.5
            Y_hat_train = (temp_model.predict(train_dmat) > 0.5) * 1
            Y_hat_val = (temp_model.predict(val_dmat) > 0.5) * 1

        else:
            temp_model = model_dict[model]()
            temp_model.set_params(**p)

            temp_model.fit(X_train, Y_train)

            # EVALUATE
            Y_hat_train = temp_model.predict(X_train)
            Y_hat_val = temp_model.predict(X_val)

        t_acc, t_sens, t_spec, t_mcc = _binary_metrics(Y_train, Y_hat_train)
        v_acc, v_sens, v_spec, v_mcc = _binary_metrics(Y_val, Y_hat_val)

        # Keep the first candidate unconditionally, then any strictly better
        # one; a defined MCC also replaces an undefined (NaN) incumbent.
        if (best_model is None or v_mcc > best_mcc
                or (np.isnan(best_mcc) and not np.isnan(v_mcc))):
            best_model = deepcopy(temp_model)
            best_mcc = v_mcc

        results.append(
            [p, t_acc, t_sens, t_spec, t_mcc, v_acc, v_sens, v_spec, v_mcc])

    results = pd.DataFrame(results,
                           columns=[
                               'params', 'train_accuracy', 'train_sensitivity',
                               'train_specificity', 'train_mcc',
                               'validation_accuracy', 'validation_sensitivity',
                               'validation_specificity', 'validation_mcc'
                           ])

    results = results.sort_values('validation_mcc', ascending=False)

    # Metrics of the best row; all eight names must match the ones printed
    # below so the summary reports the best candidate, not the last one.
    t_acc, t_sens, t_spec, t_mcc, v_acc, v_sens, v_spec, v_mcc = results.iloc[
        0, 1:]

    print(f'Best model params: {results.iloc[0, 0]}')
    print(
        'Train:      Accuracy: {:3.3f}, Sensitivity: {:3.3f}, specificity: {:3.3f}, mcc: {:.3f}'
        .format(t_acc, t_sens, t_spec, t_mcc))
    print(
        'Validation: Accuracy: {:3.3f}, Sensitivity: {:3.3f}, specificity: {:3.3f}, mcc: {:.3f}'
        .format(v_acc, v_sens, v_spec, v_mcc))
    print(best_model)

    if modelpath is not None:
        if is_xgboost:
            best_model.save_model(modelpath)
        else:
            sklearn_json.to_json(best_model, modelpath)

    if resultspath:
        results.to_csv(resultspath, sep='\t')

    return results, best_model
Пример #10
0
 def export_model_to_file(self):
     """Write ``self.model_obj`` to the path returned by ``self.get_path()``
     using ``sklearn_json.to_json``."""
     # Imported locally, so sklearn_json is only needed when exporting.
     import sklearn_json as skljson
     skljson.to_json(self.model_obj, self.get_path())
# Load the tab-separated fruit dataset directly from a signed Google Cloud
# Storage URL.
# NOTE(review): the query string carries X-Goog-Date=20210201T171906Z and
# X-Goog-Expires=259199, so this signed link has presumably expired --
# confirm, and switch to a local copy of fruit_data_with_colors.txt if so.
import pandas as pd
df = pd.read_csv(
    'https://storage.googleapis.com/kagglesdsdata/datasets/9590/13660/fruit_data_with_colors.txt?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20210201%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20210201T171906Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=30af8a4f1bb07bf9c4490a58dcfcf3580e901a7f19bf63ec258ee400c0d2dfea965e0c0c7e6db0ab7b57481a616fc2bc508f776c470bcc745f90df6bb0252c9482d23b30ffc99de8758d4e48fe6d38f1a044aa6b1cf2770a0799a7c7a7dfc58ebb526c60fdcb3e181e301ef5360433a8317cdaf752415863c73b9c10270dfd4bfaaf5a60c099cb13b5afd0c85c6518776bef2fbbb4115bd2c023c4db3ac1e14fe7549e5de4244bf48767830ef9fbc411e4ca97f93821027598226fc5725217cc24ed066281395826a740ec8e67beca8644aa35c523289b043597da5ebdd0122ff26226cdee3dc1d173c51c632e2dc2a88ebc032690182ceb68f3e1ecaff5904c',
    sep='\t')
# Dependencies for this step (pip install sklearn-json)
import pickle

import sklearn_json as skljson
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Inputs are columns 3..6 of the frame, the output is column 1
x = df.iloc[:, 3:7].values
y = df.iloc[:, 1].values

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

# Fit the scaler on the training split only, then reuse it for the test split
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

model = LogisticRegression()
model.fit(x_train, y_train)

# The model goes to JSON via sklearn-json
model_name = "lr_model.json"
scaler_name = "scaler.json"
skljson.to_json(model, model_name)

# NOTE(review): the scaler is pickled (binary) even though the file name
# ends in .json -- confirm whether a different extension was intended.
with open(scaler_name, 'wb') as f:
    pickle.dump(scaler, f)
# Requires: pip install sklearn-json
import sklearn_json as skljson

# Write the fitted model to JSON, then read it back from the same file.
file_name = "abc.json"
skljson.to_json(model, file_name)
deserialized_model = skljson.from_json(file_name)
Пример #13
0
# Train a decision tree (gini criterion, no fixed random state).
clf = tree.DecisionTreeClassifier(criterion='gini', random_state=None)
clf = clf.fit(Xtrain, Ytrain)

# Report accuracy on both splits.
# NOTE(review): the "tmp3.json" label below looks like a find/replace
# artifact for "test"; it is kept verbatim -- confirm before changing it.
print(f"train accu: {clf.score(Xtrain, Ytrain)!s}")
print(f"tmp3.json  accu: {clf.score(Xtest, Ytest)!s}")

# Render the fitted tree with graphviz.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=wine.feature_names,
    class_names=wine.target_names,
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_data)
graph.view("tree")

# Serialize the fitted tree to json.
skljson.to_json(clf, "tree_model")

# ===============================================
# ===============================================
# ===============================================