Example #1
    def test_sklearn_deploy(self, mock_post, mock_put):
        model_name = 'test-model'

        mock_post_response = Mock()
        mock_post_response.json.return_value = {'url': 'http://test-url', 'tier': 0}
        mock_post_response.status_code = 200
        mock_post.return_value = mock_post_response

        mock_put_response = Mock()
        mock_put_response.json.return_value = {}
        mock_put_response.status_code = 200
        mock_put.return_value = mock_put_response

        if path.exists(model_name):
            clf = skljson.from_json(model_name)
        else:
            X, y = make_classification(n_samples=50, n_features=3, n_classes=3, n_informative=3, n_redundant=0,
                                       random_state=0, shuffle=False)
            clf = RandomForestClassifier()
            clf.fit(X, y)
            skljson.to_json(clf, model_name)

        sklearn = SKLearn('test')
        sklearn.deploy(clf, model_name)

        mock_post.assert_called_once()
        mock_put.assert_called_once()
Example #2
 def deserialize_model(path):
     # Load (or deserialize) model from JSON
     model = skljson.from_json(path)
     # Convert coefficients back to numpy arrays after JSON deserialization
     # This is a hack to compensate for a bug in sklearn_json
     for i, x in enumerate(model.coefs_):
         model.coefs_[i] = np.array(x)
     return model
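A minimal round-trip sketch of the workaround above, assuming an MLPRegressor serialized with sklearn-json; the toy data and the file name 'mlp_model.json' are illustrative assumptions, not part of the original project:

import numpy as np
import sklearn_json as skljson
from sklearn.datasets import make_regression
from sklearn.neural_network import MLPRegressor

# Train and serialize a small MLP regressor (hypothetical file name)
X, y = make_regression(n_samples=100, n_features=4, random_state=0)
mlp = MLPRegressor(hidden_layer_sizes=(8,), max_iter=500, random_state=0).fit(X, y)
skljson.to_json(mlp, 'mlp_model.json')

# The helper above restores coefs_ as numpy arrays after loading
model = deserialize_model('mlp_model.json')
print(model.predict(X[:3]))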
Example #3
 def __init__(self, modelFile, game, config="configs/config.ini"):
     ext = modelFile.split(sep='.')[-1]
     self.model = None
     self.game = game
     if config is None:
         self.config = "configs/config.ini"
     else:
         self.config = config
     if ext == "json":
         self.model = skljson.from_json(modelFile)
     elif ext == "pickle":
         with open(modelFile, 'rb') as f:
             self.model = pickle.load(f)
     else:
         print("Unknown extension")
Example #4
    def check_sparse_model_json(self, model, model_name, abs=False):
        # Given
        if abs:
            model.fit(np.absolute(self.X_sparse), self.y_sparse)
        else:
            model.fit(self.X_sparse, self.y_sparse)

        # When
        serialized_model = skljson.to_json(model, model_name)
        deserialized_model = skljson.from_json(model_name)

        # Then
        expected_predictions = model.predict(self.X)
        actual_predictions = deserialized_model.predict(self.X)

        testing.assert_array_equal(expected_predictions, actual_predictions)
Example #5
def predict(
    path="",
    model=None,
    prefix="",
    out_dir="",
):
    """
    ----
    path: str
        Path to serialized/pickeld training set
    model: sklearn model
        Path to serialized RandomForest classifier (trained)
    prefix:
        prefix for .csv file with prediction results and serialized model
    out_dir:
        path to create output folder.
    """
    if model is None:
        model = Path(__file__).parent / "data/model/UP000005640_9606_llps.json"
    try:
        logger.info("Loading model: {m}", m=model)
        clf = skljson.from_json(model)
    except Exception:
        logger.error("classifier {mod} not found. Does the file exist?",
                     mod=model)
        raise
    mat = export_matrix(prefix=prefix, fasta_path=path, out_path=out_dir)
    # Preprocessing
    data_ps = preprocess_and_scaledata(mat.df)
    X = data_ps.select_dtypes([np.number])
    logger.info("Predicting PSAP_score")
    psap_prediction = pd.DataFrame(index=data_ps["protein_name"])
    psap_prediction["PSAP_score"] = clf.predict_proba(X)[:, 1]
    psap_prediction["rank"] = 0
    rank = psap_prediction["PSAP_score"].rank(ascending=False)
    psap_prediction["rank"] = rank
    # Make directory for output
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_file = out_dir / f"prediction_{prefix}.csv"
    logger.info("Writing results to: {csv}", csv=out_file)
    psap_prediction.to_csv(out_file)
Example #6
    def test_sklearn_model_exceeded(self, mock_post):
        model_name = 'test-model-1mb'

        mock_post_response = Mock()
        mock_post_response.json.return_value = {'url': 'http://test-url', 'tier': 0}
        mock_post_response.status_code = 200
        mock_post.return_value = mock_post_response

        if path.exists(model_name):
            clf = skljson.from_json(model_name)
        else:
            X, y = make_classification(n_samples=15000, n_features=10, n_classes=3, n_informative=3, n_redundant=0,
                                       random_state=0, shuffle=False)
            clf = RandomForestClassifier()
            clf.fit(X, y)
            skljson.to_json(clf, model_name)

        sklearn = SKLearn('test')
        with self.assertRaises(mlrequest.ModelSizeExceeded) as exception:
            sklearn.deploy(clf, model_name)
        mock_post.assert_called_once()
Example #7
def load_serialized(filename_with_path):
    """ load a serialized model """
    if not os.path.isfile(filename_with_path):
        print(
            "{} is not a valid file, please check".format(filename_with_path))
        return
    feature_selection_filename = filename_with_path.replace(
        "_reg.json", "_fs.json")
    feature_selection = None
    if os.path.isfile(feature_selection_filename):
        feature_selection = json_load(feature_selection_filename)

    regressor = skljson.from_json(filename_with_path)
    # override n_jobs to prevent warning, model should be fast enough
    # n_jobs helps during training
    regressor.n_jobs = 1

    class Model:
        """ wrapper to the serialized scikit learn model,
        that uses feature selection in the first step
        """
        def __init__(self, regressor, fs=None):
            self._regressor = regressor
            self._fs = fs

        def feature_select(self, X):
            fs = np.array(self._fs)
            X = np.array(X)
            _X = []
            # perform selection for each input row
            for x in X:
                _X.append(x[fs])
            return _X

        def predict(self, X):
            if self._fs:
                X = self.feature_select(X)
            return self._regressor.predict(X)

    return Model(regressor, feature_selection)
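A hedged sketch of producing the paired '<name>_reg.json' / '<name>_fs.json' files that load_serialized expects; the regressor, the selected indices, and the file names are illustrative assumptions (it also assumes the module's json_load helper reads a plain JSON file):

import json
import numpy as np
import sklearn_json as skljson
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=100, n_features=6, random_state=0)
selected = [0, 2, 5]  # indices kept by a hypothetical feature-selection step
reg = RandomForestRegressor(n_estimators=10, random_state=0).fit(X[:, selected], y)

skljson.to_json(reg, 'demo_reg.json')      # serialized regressor
with open('demo_fs.json', 'w') as f:
    json.dump(selected, f)                 # serialized feature-selection indices

model = load_serialized('demo_reg.json')   # wrapper defined above applies the selection before predicting
print(model.predict(X[:2]))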
Example #8
def conOutFinal(cfile, order):
    a = open(cfile.replace('.tocnf', '.possum'), 'w')
    try:

        specMODEL = skljson.from_json(
            pkg_resources.resource_filename(
                'POSMM', 'models/species_' + str(order) + '.json'))
        genMODEL = skljson.from_json(
            pkg_resources.resource_filename(
                'POSMM', 'models/genus_' + str(order) + '.json'))
        famMODEL = skljson.from_json(
            pkg_resources.resource_filename(
                'POSMM', 'models/family_' + str(order) + '.json'))
        claMODEL = skljson.from_json(
            pkg_resources.resource_filename(
                'POSMM', 'models/class_' + str(order) + '.json'))
        ordMODEL = skljson.from_json(
            pkg_resources.resource_filename(
                'POSMM', 'models/order_' + str(order) + '.json'))
        phyMODEL = skljson.from_json(
            pkg_resources.resource_filename(
                'POSMM', 'models/phylum_' + str(order) + '.json'))
    except Exception:
        print('ERROR: Models missing. Aborting')
        sys.exit(1)
    modlist = [phyMODEL, claMODEL, ordMODEL, famMODEL, genMODEL, specMODEL]
    matchpos = []
    for m in modlist:
        mp = (list(m.classes_)).index('Match')
        matchpos.append(mp)
    with open(cfile) as infile:
        for lines in infile:
            lines = lines.rstrip()
            values = lines.split('\t')
            taxid = values[2]
            lineage = glin[taxid][1:]
            rawscore = float(values[1])
            readlen = int(values[0])
            wrstr = []
            for x in range(len(modlist)):
                cfs = modlist[x].predict_proba([[rawscore,
                                                 readlen]])[0][matchpos[x]]
                wrstr.append(lineage[x] + ':::' + str(cfs))
            a.write('\t'.join(wrstr) + '\n')
    a.close()
Example #9
def open_model(model_path):
    """Open and return a model from json file
    
    :param model_path: path to the model
    """
    if model_path.endswith('gz'):
        with gzip.open(model_path, 'r') as f:
            model = f.read()
            model = json.loads(model.decode('utf-8'))
            model = from_dict(model)
        return model

    else:
        with open(model_path, 'r') as f:
            a = f.readline()

        if a.startswith('{"learner"'):
            model = xgb.Booster()
            model.load_model(model_path)
            return model

        else:
            model = from_json(model_path)
            return model
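A short usage sketch of the dispatch above; the file names are illustrative assumptions (a gzipped sklearn-json dump, a native XGBoost JSON model saved with Booster.save_model, and a plain sklearn-json file, respectively):

model = open_model('forest.json.gz')     # gzip branch -> json.loads + from_dict
booster = open_model('xgb_model.json')   # file starting with {"learner" -> xgb.Booster
clf = open_model('forest.json')          # fallback -> sklearn-json from_json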
Example #10
def predict(path, model, prefix="", out_dir=""):
    """
    ----
    path: str
        Path to serialized/pickeld training set
    model: sklearn model
        Path to serialized RandomForest classifier (trained)
    prefix:
        prefix for .csv file with prediction results and serialized model
    out_dir:
        path to create output folder.
    """
    print("Loading model")
    print(model)
    try:
        clf = skljson.from_json(model)
    except Exception:
        print("An error occurred while importing the model from JSON")
        raise
    print("annotating fasta")
    data = export_matrix(name=prefix, fasta_path=path, out_path=out_dir)
    # Preprocessing
    data_ps = preprocess_and_scaledata(data, "llps")
    data_numeric = data_ps.select_dtypes([np.number])
    X = data_numeric.drop("llps", axis=1)
    y = data_numeric["llps"]
    psap_prediction = pd.DataFrame(index=data["protein_name"])
    psap_prediction["PSAP_score"] = clf.predict_proba(X)[:, 1]
    psap_prediction["llps"] = y.values
    psap_prediction["rank"] = 0
    rank = psap_prediction.loc[psap_prediction["llps"] == 0,
                               "PSAP_score"].rank(ascending=False)
    psap_prediction["rank"] = rank
    # Make directory for output
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    psap_prediction.to_csv(out_dir / f"prediction_{prefix}.csv")
Example #11
# sklearn rf-model, serialize as json

import sklearn_json as skljson
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

## load data
wine = load_wine()

## split train/test
Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data,
                                                wine.target,
                                                test_size=0.3)

model = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=0)\
    .fit(Xtrain, Ytrain)

## to json
skljson.to_json(model, "rf-model")

## from_json
model2 = skljson.from_json("rf-model")
score = model2.score(Xtest, Ytest)
print(score)
Example #12
import sklearn_json as skljson
import pickle
deserialized_model = skljson.from_json('lr_model.json')
from sklearn.preprocessing import MinMaxScaler
with open('scaler.json', 'rb') as f:
    scaler = pickle.load(f)

deserialized_model.predict(scaler.transform([[152, 6.5, 8.5, 0.72]]))
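A hedged sketch of how the two artifacts above might have been produced; the LogisticRegression model, the toy rows, and the feature layout are assumptions for illustration only:

import pickle
import sklearn_json as skljson
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

# Hypothetical 4-feature training rows matching the predict() call above
X = [[120, 4.0, 6.0, 0.50], [200, 9.0, 11.0, 0.95]]
y = [0, 1]

scaler = MinMaxScaler().fit(X)
model = LogisticRegression().fit(scaler.transform(X), y)

skljson.to_json(model, 'lr_model.json')   # model serialized with sklearn-json
with open('scaler.json', 'wb') as f:      # scaler is pickled, despite the .json name
    pickle.dump(scaler, f)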
Example #13
    explainer = shap.TreeExplainer(clf)
    shap_values = explainer.shap_values(X_train)
    vals = np.abs(shap_values[0]).mean(0)
    shap_importance = pd.Series(vals, index=X_train.columns).rename('shap')
    shap_importance.sort_values(ascending=False, inplace=True)
    imp = pd.concat([imp, shap_importance], axis=1)
    imp.to_csv('feature_importnace.csv')

    # Save model
    import sklearn_json as skljson
    file_name = os.path.join('models', 'random_forest.json')
    clf_ser = serialize_random_forest(clf)
    with open(file_name, 'w') as model_json:
        json.dump(clf_ser, model_json)

    deserialized_model = skljson.from_json(file_name)
    deserialized_model.predict(X_test)

    # add to dropbox
    # import dropbox
    # dbx = dropbox.Dropbox('RPqFmEm0LbUAAAAAAAAAAZ5Q4ZbVET-HQh18ixMUp6Gcx5lc0vMYMzMA2rueMjO6')
    # with open(file_name, 'rb') as f:
    #     dbx.files_upload(f.read(), '/trend_labeling/' + file_name, mute = True)

    ### REFIT THE MODEL WITH MOST IMPORTANT FEATURES
    # fi_cols = shap_values['col_name'].head(keep_important_features)
    # X_train_important = X_train[fi_cols]
    # X_test_important = X_test[fi_cols]
    # clf = RandomForestClassifier(criterion='entropy',
    #                         max_features=keep_important_features,
    #                         min_weight_fraction_leaf=min_weight_fraction_leaf,
Example #14
# pip install sklearn-json

import sklearn_json as skljson

file_name = "abc.json"
skljson.to_json(model, file_name)

deserialized_model = skljson.from_json('abc.json')
Example #15
# Read a previously saved model from file and use it

import json
import sklearn_json as skljson
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_wine

with open("tree_model", mode='r', encoding='utf-8') as fp:
    f = json.load(fp)
# print(f, type(f))
# print(f['meta'])

# ======================================
# Load the model from the JSON file and use it
# ======================================
wine = load_wine()
Xtrain, Xtest, Ytrain, Ytest = train_test_split(wine.data,
                                                wine.target,
                                                test_size=0.3)

model = skljson.from_json("tree_model")

print(model.score(Xtrain, Ytrain))
print(model.score(Xtest, Ytest))

print(model.predict(Xtest))
print(Ytest)