def test_combined_models_support_predict_proba_with_more_than_2_classes(
        self):
    """
    Check that predict_proba works on a model built by combining a
    PrefixColumnConcatenator pipeline with a loaded predictor model,
    and that the result has three columns.
    """
    infert_path = get_dataset('infert').as_filepath()
    raw_data = FileDataStream.read_csv(infert_path)

    # Fit the one-hot featurizer on the raw data and featurize it.
    featurizer = Pipeline(
        [OneHotVectorizer(columns={'education': 'education'})])
    featurizer.fit(raw_data)
    featurized = featurizer.transform(raw_data)

    # Train on the raw data through the fitted featurizer model and
    # request the predictor model as a separate output.
    training_steps = [
        DatasetTransformer(featurizer.model),
        OneVsRestClassifier(LogisticRegressionBinaryClassifier(),
                            feature=['education', 'age'],
                            label='induced'),
    ]
    trainer = Pipeline(training_steps)
    trainer.fit(raw_data, output_predictor_model=True)

    # Fit the column concatenator on the featurized data.
    concatenator = Pipeline(
        [PrefixColumnConcatenator({'education': 'education.'})])
    concatenator.fit(featurized)

    # Load the predictor model on its own and put the concatenator
    # in front of it.
    predictor = Pipeline()
    predictor.load_model(trainer.predictor_model)
    combined = Pipeline.combine_models(concatenator, predictor)

    probabilities = combined.predict_proba(featurized)
    self.assertEqual(probabilities.shape[1], 3)
def test_experiment_loadsavemodel(self):
    """
    Check that a pipeline saved with save_model and reloaded with
    load_model produces the same summed test metrics as the original.
    """
    (train, label) = get_X_y(train_file, label_column, sep=',')
    (test, label1) = get_X_y(test_file, label_column, sep=',')

    cat = OneHotVectorizer() << categorical_columns
    ftree = FastTreesBinaryClassifier()
    pipeline = Pipeline([cat, ftree])
    pipeline.fit(train, label)
    metrics1, scores1 = pipeline.test(test, label1, 'binary',
                                      output_scores=True)
    sum1 = metrics1.sum().sum()

    # mkstemp returns an open descriptor; only the path is needed, so
    # close the descriptor right away.
    (fd, modelfilename) = tempfile.mkstemp(suffix='.model.bin')
    os.close(fd)
    try:
        pipeline.save_model(modelfilename)

        pipeline2 = Pipeline()
        pipeline2.load_model(modelfilename)
        metrics2, scores2 = pipeline2.test(test, label1, 'binary',
                                           output_scores=True)
        sum2 = metrics2.sum().sum()
    finally:
        # Do not leave the temporary model file behind, even on failure.
        os.remove(modelfilename)

    assert_equal(sum1, sum2,
                 "model metrics don't match after loading model")
def test_notvectorized_output_predictor_model(self):
    """
    Check that the predictor model emitted by a pipeline that combines
    a featurizer with a learner scores already featurized data that
    contains no vector columns.
    """
    frame = train_df.drop(['c0'], axis=1)

    # Fit a standalone RangeFilter on the data and apply it to get
    # the featurized frame.
    filter_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'],
                               random_state=seed)
    filter_pipeline.fit(frame)
    filtered_frame = filter_pipeline.transform(frame)

    # Fit the filter and the regressor together, asking for the
    # predictor model as a separate output.
    steps = [
        RangeFilter(min=0.0, max=4.5) << 'c2',
        OnlineGradientDescentRegressor(label='c2'),
    ]
    full_pipeline = Pipeline(steps, random_state=seed)
    full_pipeline.fit(frame, output_predictor_model=True)
    expected = full_pipeline.predict(frame)

    # Score the featurized frame with the predictor model alone.
    predictor_only = Pipeline()
    predictor_only.load_model(full_pipeline.predictor_model)
    actual = predictor_only.predict(filtered_frame)

    for row in (0, 1):
        self.assertEqual(expected.loc[row, 'Score'],
                         actual.loc[row, 'Score'])
def test_pipeline_loaded_from_zip_has_feature_contributions(self):
    """
    Check that a pipeline reloaded from a saved zip model returns the
    same per-feature contribution columns as the original pipeline.
    """
    features = ['age', 'education-num', 'hours-per-week']

    model_nimbusml = Pipeline(
        steps=[FastLinearBinaryClassifier(feature=features)])
    model_nimbusml.fit(train, label)
    fc = model_nimbusml.get_feature_contributions(test)

    # Save the model to zip
    model_filename = get_temp_file(suffix='.zip')
    try:
        model_nimbusml.save_model(model_filename)

        # Load the model from zip
        model_nimbusml_zip = Pipeline()
        model_nimbusml_zip.load_model(model_filename)
        fc_zip = model_nimbusml_zip.get_feature_contributions(test)

        # Assert per feature: asserting on a non-empty list is always
        # true and would hide a missing or mismatching column.
        for feature in features:
            column = 'FeatureContributions.' + feature
            assert column in fc_zip.columns, column
            assert fc[column].equals(fc_zip[column]), column
    finally:
        # The assertions can now fail, so clean up in every case.
        os.remove(model_filename)
def nimbus_pred(model_path, test_set_path):
    """
    Load the model stored at model_path, score the CSV file at
    test_set_path with it and print the predictions.

    Column 'c' of the test set is converted to a categorical column
    before scoring.
    """
    features = pd.read_csv(test_set_path)
    features['c'] = features['c'].astype("category")

    model = Pipeline()
    model.load_model(model_path)
    print(model.predict(features))
def test_binary_classifier_from_loaded_model(self):
    """
    Check that permutation feature importance computed by a reloaded
    copy of self.binary_model equals self.binary_pfi.
    """
    saved_path = get_temp_model_file()
    self.binary_model.save_model(saved_path)

    reloaded = Pipeline()
    reloaded.load_model(saved_path)
    reloaded_pfi = reloaded.permutation_feature_importance(
        self.classification_data)

    assert_frame_equal(self.binary_pfi, reloaded_pfi)
    os.remove(saved_path)
def test_ranker_from_loaded_model(self):
    """
    Check that permutation feature importance computed by a reloaded
    copy of self.ranker_model equals self.ranker_pfi.
    """
    saved_path = get_temp_model_file()
    self.ranker_model.save_model(saved_path)

    reloaded = Pipeline()
    reloaded.load_model(saved_path)
    reloaded_pfi = reloaded.permutation_feature_importance(
        self.ranking_data)

    assert_frame_equal(self.ranker_pfi, reloaded_pfi)
    os.remove(saved_path)
def test_model_datastream(self):
    """
    Check that a fitted pipeline gives matching predictions and metrics
    (to two decimals, on summed values) after a pickle round trip and
    after a save_model / load_model round trip.
    """
    steps = [
        ('cat', OneHotVectorizer() << categorical_columns),
        ('linear', FastLinearBinaryClassifier(shuffle=False,
                                              number_of_threads=1)),
    ]
    fitted = Pipeline(steps=steps)
    fitted.fit(train, label)

    # Round trip through pickle
    pickle_path = get_temp_file(suffix='.p')
    with open(pickle_path, 'wb') as sink:
        pickle.dump(fitted, sink)
    with open(pickle_path, "rb") as source:
        unpickled = pickle.load(source)
    os.remove(pickle_path)

    preds = fitted.predict(test).head(5)
    preds_pickle = unpickled.predict(test).head(5)
    metrics, _ = fitted.test(test, test_label, output_scores=True)
    metrics_pickle, _ = unpickled.test(test, test_label,
                                       output_scores=True)

    assert_almost_equal(preds.sum().sum(),
                        preds_pickle.sum().sum(),
                        decimal=2)
    assert_almost_equal(metrics.sum().sum(),
                        metrics_pickle.sum().sum(),
                        decimal=2)

    # Round trip through the pipeline's own save/load methods
    model_path = get_temp_file(suffix='.m')
    fitted.save_model(model_path)
    reloaded = Pipeline()
    reloaded.load_model(model_path)

    preds = fitted.predict(test).head(5)
    preds_loaded = reloaded.predict(test).head(5)
    metrics, _ = fitted.test(test, test_label, output_scores=True)
    metrics_loaded, _ = reloaded.test(test, test_label,
                                      evaltype='binary',
                                      output_scores=True)

    assert_almost_equal(preds.sum().sum(),
                        preds_loaded.sum().sum(),
                        decimal=2)
    assert_almost_equal(metrics.sum().sum(),
                        metrics_loaded.sum().sum(),
                        decimal=2)

    os.remove(model_path)
def test_pass_predict_proba_from_load_model(selfs):
    """
    Check that predict_proba on a saved-and-reloaded pipeline gives the
    same summed probabilities as the original pipeline.

    NOTE(review): the first parameter is spelt ``selfs``; it is left
    unchanged here so the signature stays exactly as it was.
    """
    pipeline = Pipeline([LogisticRegressionBinaryClassifier()])
    pipeline.fit(X_train, y_train)
    probs1 = pipeline.predict_proba(X_test)
    sum1 = probs1.sum().sum()

    # mkstemp returns an open descriptor; only the path is needed, so
    # close the descriptor right away.
    (fd, modelfilename) = tempfile.mkstemp(suffix='.model.bin')
    os.close(fd)
    try:
        pipeline.save_model(modelfilename)

        pipeline2 = Pipeline()
        pipeline2.load_model(modelfilename)
        probs2 = pipeline2.predict_proba(X_test)
        sum2 = probs2.sum().sum()
    finally:
        # Do not leave the temporary model file behind, even on failure.
        os.remove(modelfilename)

    assert_equal(sum1, sum2,
                 "model probabilities don't match after loading model")
def test_model_datastream(self):
    """
    Check that a fitted pipeline gives matching predictions and metrics
    (to two decimals, on summed values) after a pickle round trip and
    after a save_model / load_model round trip.

    The pickle and model files are written to fixed names in the
    current working directory.
    """
    model_nimbusml = Pipeline(
        steps=[('cat', OneHotVectorizer() << categorical_columns),
               ('linear', FastLinearBinaryClassifier(shuffle=False,
                                                     train_threads=1))])
    model_nimbusml.fit(train, label)

    # Save with pickle; context managers make sure both handles are
    # closed before the file is read back.
    with open('nimbusml_model.p', 'wb') as f:
        pickle.dump(model_nimbusml, f)
    with open("nimbusml_model.p", "rb") as f:
        model_nimbusml_pickle = pickle.load(f)

    score1 = model_nimbusml.predict(test).head(5)
    score2 = model_nimbusml_pickle.predict(test).head(5)

    metrics, score = model_nimbusml.test(test, test_label,
                                         output_scores=True)
    metrics_pickle, score_pickle = model_nimbusml_pickle.test(
        test, test_label, output_scores=True)

    assert_almost_equal(score1.sum().sum(),
                        score2.sum().sum(),
                        decimal=2)
    assert_almost_equal(metrics.sum().sum(),
                        metrics_pickle.sum().sum(),
                        decimal=2)

    # Save load with pipeline methods
    model_nimbusml.save_model('model.nimbusml.m')
    model_nimbusml_load = Pipeline()
    model_nimbusml_load.load_model('model.nimbusml.m')

    score1 = model_nimbusml.predict(test).head(5)
    score2 = model_nimbusml_load.predict(test).head(5)

    metrics, score = model_nimbusml.test(test, test_label,
                                         output_scores=True)
    # Keep the metrics in their own name rather than rebinding the
    # pipeline variable to a metrics frame.
    metrics_load, score_load = model_nimbusml_load.test(
        test, test_label, evaltype='binary', output_scores=True)

    assert_almost_equal(score1.sum().sum(),
                        score2.sum().sum(),
                        decimal=2)
    assert_almost_equal(metrics.sum().sum(),
                        metrics_load.sum().sum(),
                        decimal=2)
def test_combine_two_pipelines_created_from_model_files(self):
    """
    Check that two models loaded into fresh Pipelines can be combined
    and that the combined Pipeline predicts the same scores as the
    two original Pipelines applied one after the other.
    """
    # Fit a OneHotVectorizer on the training data and featurize the
    # training data with it.
    featurizer = Pipeline([OneHotVectorizer() << 'c0'],
                          random_state=seed)
    featurizer.fit(train_df)
    featurized_train = featurizer.transform(train_df,
                                            as_binary_data_stream=True)

    # Fit the regressor on the featurized training data.
    regressor = OnlineGradientDescentRegressor(label='c2',
                                               feature=['c0', 'c1'])
    predictor = Pipeline([regressor], random_state=seed)
    predictor.fit(featurized_train)

    # Reference scores: featurize the test data, then predict.
    featurized_test = featurizer.transform(test_df,
                                           as_binary_data_stream=True)
    expected = predictor.predict(featurized_test)

    # Build new Pipelines from the model files held by the fitted ones.
    loaded_featurizer = Pipeline()
    loaded_featurizer.load_model(featurizer.model)

    loaded_predictor = Pipeline()
    loaded_predictor.load_model(predictor.model)

    # Combine the loaded Pipelines and score the raw test data.
    combined = Pipeline.combine_models(loaded_featurizer,
                                       loaded_predictor)
    actual = combined.predict(test_df)

    # The first two scores must match the reference scores.
    for row in (0, 1):
        self.assertEqual(expected.loc[row, 'Score'],
                         actual.loc[row, 'Score'])
def test_vectorized_with_prefixconcat_output_predictor_model(self):
    """
    Show that putting a PrefixColumnConcatenator in front of the
    predictor model emitted by a combined (featurizer + learner)
    pipeline lets it score featurized data that contains vectors.
    """
    # Fit a OneHotVectorizer on the training data and featurize the
    # training data with it.
    featurizer = Pipeline([OneHotVectorizer() << 'c0'],
                          random_state=seed)
    featurizer.fit(train_df)
    featurized = featurizer.transform(train_df)

    # Fit and score the combined model, asking for the predictor
    # model as a separate output.
    steps = [
        OneHotVectorizer() << 'c0',
        OnlineGradientDescentRegressor(label='c2'),
    ]
    full_pipeline = Pipeline(steps, random_state=seed)
    full_pipeline.fit(train_df, output_predictor_model=True)
    expected = full_pipeline.predict(train_df)

    # Fit the concatenator on the featurized data.
    concatenator = Pipeline(
        [PrefixColumnConcatenator(columns={'c0': 'c0.'})])
    concatenator.fit(featurized)

    # Load the predictor model on its own.
    predictor_only = Pipeline()
    predictor_only.load_model(full_pipeline.predictor_model)

    # Chain concatenator and predictor, then score the featurized data.
    chained = Pipeline.combine_models(concatenator, predictor_only)
    actual = chained.predict(featurized)

    for row in (0, 1):
        self.assertEqual(expected.loc[row, 'Score'],
                         actual.loc[row, 'Score'])
def test_ensemble_supports_output_predictor_model(self):
    """
    Check that a pipeline ending in a VotingRegressor can emit a
    predictor model, and that this predictor model scores the same
    first two rows as the full pipeline.

    The full pipelines return 2 rows for the 4-row input asserted
    below, while the predictor-only pipeline returns all 4.
    """
    test2_df = test_df.copy(deep=True)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent that works across pandas versions.
    test2_df = pd.concat(
        [test2_df, pd.DataFrame({'c1': [9, 11], 'c2': [1, 1]})],
        ignore_index=True)
    test2_df = test2_df.astype({'c1': np.float32, 'c2': np.float32})

    # Create a ground truth pipeline
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    combined_pipeline = Pipeline([
        RangeFilter(min=0.0, max=4.5) << 'c1',
        VotingRegressor(estimators=[r1, r2], combiner='Average')
    ])
    combined_pipeline.fit(train_df)
    result_1 = combined_pipeline.predict(test2_df)

    # Create a duplicate pipeline but also request a predictor model
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    combined_pipeline = Pipeline([
        RangeFilter(min=0.0, max=4.5) << 'c1',
        VotingRegressor(estimators=[r1, r2], combiner='Average')
    ])
    combined_pipeline.fit(train_df, output_predictor_model=True)
    result_2 = combined_pipeline.predict(test2_df)

    # Create a predictor model only pipeline
    predictor_pipeline = Pipeline()
    predictor_pipeline.load_model(combined_pipeline.predictor_model)
    result_3 = predictor_pipeline.predict(test2_df)

    # Verify the first rows are equal
    self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score'])
    self.assertEqual(result_2.loc[0, 'Score'], result_3.loc[0, 'Score'])

    # Verify the second rows are equal
    self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score'])
    self.assertEqual(result_2.loc[1, 'Score'], result_3.loc[1, 'Score'])

    # Verify the number of rows
    self.assertEqual(len(result_1), 2)
    self.assertEqual(len(result_2), 2)
    self.assertEqual(len(result_3), 4)
def test_vectorized_output_predictor_model(self):
    """
    Show that the predictor model emitted by a combined (featurizer +
    learner) pipeline fails to score featurized data that contains
    vectors.
    """
    # Create and fit a OneHotVectorizer transform using the
    # training data and use it to transform the training data.
    transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'],
                                  random_state=seed)
    transform_pipeline.fit(train_df)
    df = transform_pipeline.transform(train_df)

    # Create and fit a combined model and spit out predictor model
    combined_pipeline = Pipeline([
        OneHotVectorizer() << 'c0',
        OnlineGradientDescentRegressor(label='c2')
    ], random_state=seed)
    combined_pipeline.fit(train_df, output_predictor_model=True)
    # Scoring the raw data through the combined pipeline must still
    # run; the returned scores are not needed here.
    combined_pipeline.predict(train_df)

    # Load predictor pipeline and score featurized data
    predictor_pipeline = Pipeline()
    predictor_pipeline.load_model(combined_pipeline.predictor_model)

    # This does not work because the input schema doesn't match.
    # The input schema looks for vector 'c0' with slots 'a,b'
    # but the featurized data only has columns 'c0.a' and 'c0.b'.
    with self.assertRaises(Exception):
        predictor_pipeline.predict(df)
def test_pickled_pipeline_with_predictor_model(self):
    """
    Pickle a pipeline fitted with output_predictor_model=True, delete
    its model files, unpickle it, and check that the unpickled
    pipeline's predictor model yields the same predictions as the
    original pipeline.
    """
    float_columns = {'c1': np.float64, 'c2': np.float64}
    train_df = pd.DataFrame(
        {'c1': [1, 2, 3, 4], 'c2': [2, 3, 4, 5]}).astype(float_columns)
    test_df = pd.DataFrame(
        {'c1': [1.5, 2.3, 3.7],
         'c2': [2.2, 4.9, 2.7]}).astype(float_columns)

    # Fit with a separate predictor model and take reference predictions.
    fitted = Pipeline([OnlineGradientDescentRegressor(label='c2')],
                      random_state=0)
    fitted.fit(train_df, output_predictor_model=True)
    expected = fitted.predict(test_df)

    # Both models must be set and must differ from each other.
    self.assertTrue(fitted.model)
    self.assertTrue(fitted.predictor_model)
    self.assertNotEqual(fitted.model, fitted.predictor_model)

    pickle_path = get_temp_file(suffix='.p')
    with open(pickle_path, 'wb') as sink:
        pickle.dump(fitted, sink)

    # Delete the model files before unpickling.
    os.remove(fitted.model)
    os.remove(fitted.predictor_model)

    with open(pickle_path, "rb") as source:
        restored = pickle.load(source)

    os.remove(pickle_path)

    # Score with the predictor model of the unpickled pipeline.
    predictor_only = Pipeline()
    predictor_only.load_model(restored.predictor_model)
    actual = predictor_only.predict(test_df)

    self.assertTrue(expected.equals(actual))
def _write_sentences_tsv(path, sentences):
    """Write the cleaned sentences to ``path`` as a tab-separated ID/Text file."""
    with open(path, 'w', newline='') as file:
        # Use a real tab delimiter with two fields per row. Writing the
        # tab inside a single comma-CSV field made the writer quote the
        # whole row whenever a sentence contained a comma or a quote.
        writer = csv.writer(file, delimiter='\t')
        writer.writerow(["ID", "Text"])
        for number, sentence in enumerate(sentences, start=1):
            # Drop literal "\n" sequences and any remaining backslashes.
            sentence = sentence.replace("\\n", "").replace("\\", "")
            writer.writerow([str(number), str(sentence)])
            logging.info(sentence)


def _build_output_record(record_id, content, observations):
    """Build one output entry from the parsed content and the observations."""
    # When no content was parsed ("" sentinel) every field falls back to
    # its empty default; otherwise values are read from the content.
    fields = content if content != "" else {}
    return {
        "recordId": record_id,
        "data": {
            "senderPosition": fields.get('senderPosition', ''),
            "receipients": fields.get('receipients', []),
            "topics": fields.get('topics', []),
            "sendDate": fields.get('sendDate', ''),
            "sender": fields.get('sender', ''),
            "observations": observations,
            "senderEmail": fields.get('senderEmail', ''),
            # Falls back to the whole content when it has no 'text' key.
            "text": fields.get('text', content),
            "subject": fields.get('subject', '')
        },
        "errors": None,
        "warnings": None
    }


def main(req: func.HttpRequest) -> func.HttpResponse:
    """
    Score the sentences of every record in the request with the model
    downloaded from blob storage and return one output entry per record.

    Steps:
      1. Read the JSON body and download blob "model.zip" from
         container "artifacts" into the directory named by the
         ``filestore`` environment variable. The connection string comes
         from the ``storageaccountobsera1dd`` environment variable.
      2. For each entry of ``values``: write its sentences to
         ``document.csv``, predict on that file, and collect
         ``{"sentence", "score", "probability"}`` observations.
      3. Return ``{"values": [...]}`` as JSON.

    Failures are reported as short plain-text responses ("input not
    loaded!", "document write failed", "model zip didnt load",
    "sentences extraction failed!") without an explicit status code.
    A failed prediction leaves the record's observations empty; a failed
    output formation skips the record. An empty body or an empty
    ``values`` list returns ``{"values": []}``.
    """
    logging.info('Python HTTP trigger function processed a request.')
    try:
        payload = req.get_json()
        logging.info(payload)
        storageaccountobsera1dd = os.environ["storageaccountobsera1dd"]
        file_store = os.environ['filestore']
        blob = BlobClient.from_connection_string(
            conn_str=storageaccountobsera1dd,
            container_name="artifacts",
            blob_name="model.zip")
        with open(file_store + "/model.zip", "wb") as my_blob:
            blob_data = blob.download_blob()
            blob_data.readinto(my_blob)
    except Exception as ex:
        logging.info(ex)
        return func.HttpResponse("input not loaded!")

    output = []
    try:
        # The model is the same for every record, so load it at most
        # once, the first time a record needs it.
        pipeline = None
        for record in (payload["values"] if payload else []):
            observations = []
            sentences = record["data"]["sentences"]
            try:
                content = json.loads(record["data"]["content"])
            except Exception as a:
                # Content is optional: fall back to the "" sentinel.
                logging.info(a)
                content = ""
            logging.info("sentences extraction successful!")

            try:
                _write_sentences_tsv(file_store + '/document.csv',
                                     sentences)
            except Exception:
                logging.exception("document write failed")
                return func.HttpResponse("document write failed")

            if pipeline is None:
                try:
                    modelpath = file_store + '/model.zip'
                    loaded = Pipeline()
                    loaded.load_model(modelpath)
                    pipeline = loaded
                except Exception as e:
                    logging.info(e)
                    logging.info("model zip didnt load")
                    return func.HttpResponse("model zip didnt load")

            try:
                datapath = file_store + '/document.csv'
                data = FileDataStream.read_csv(datapath,
                                               sep='\t',
                                               header=True)
                result = pipeline.predict(data)
                logging.info(result)
                scored = zip(result.Score, result.Probability)
                for number, (score, probability) in enumerate(scored,
                                                              start=1):
                    observations.append({
                        "sentence": str(number),
                        "score": float(score),
                        "probability": float(probability)
                    })
            except Exception:
                # Best effort: keep going with whatever observations
                # were collected, but record why prediction failed.
                logging.exception("predict failed")

            try:
                logging.info("content is here")
                logging.info(content)
                output.append(_build_output_record(record["recordId"],
                                                   content,
                                                   observations))
            except Exception as d:
                logging.info(d)
                logging.info("output formation failed")

        records = {"values": output}
        logging.info(records)
        return func.HttpResponse(
            json.dumps(records, ensure_ascii=False),
            status_code=200,
            headers={"content-type": "application/json"})
    except Exception:
        logging.exception("sentences extraction failed!")
        return func.HttpResponse("sentences extraction failed!")