示例#1
0
    def test_combined_models_support_predict_proba_with_more_than_2_classes(
            self):
        """
        Check that predict_proba works on a pipeline built with
        Pipeline.combine_models whose predictor is a OneVsRestClassifier.

        The test asserts that the returned frame has three columns.
        """
        infert_stream = FileDataStream.read_csv(
            get_dataset('infert').as_filepath())

        # Featurize 'education' on its own so the featurized data can be
        # scored later without going through the training pipeline.
        featurizer = Pipeline(
            [OneHotVectorizer(columns={'education': 'education'})])
        featurizer.fit(infert_stream)
        featurized = featurizer.transform(infert_stream)

        # Train on the raw data, reusing the fitted featurizer model, and
        # request the predictor-only model as an extra output.
        training_steps = [
            DatasetTransformer(featurizer.model),
            OneVsRestClassifier(LogisticRegressionBinaryClassifier(),
                                feature=['education', 'age'],
                                label='induced'),
        ]
        trainer = Pipeline(training_steps)
        trainer.fit(infert_stream, output_predictor_model=True)

        # Fit a concatenator on the featurized data; it is prepended to the
        # predictor-only model below.
        concatenator = Pipeline(
            [PrefixColumnConcatenator({'education': 'education.'})])
        concatenator.fit(featurized)

        predictor_only = Pipeline()
        predictor_only.load_model(trainer.predictor_model)

        combined = Pipeline.combine_models(concatenator, predictor_only)
        probabilities = combined.predict_proba(featurized)

        self.assertEqual(probabilities.shape[1], 3)
示例#2
0
    def test_experiment_loadsavemodel(self):
        """
        Verify that a fitted pipeline yields the same test metrics after
        being saved to disk and loaded into a new Pipeline.

        Metrics are compared through their grand total
        (``metrics.sum().sum()``). The temporary model file is always
        deleted, even when the test fails.
        """
        (train, label) = get_X_y(train_file, label_column, sep=',')
        (test, label1) = get_X_y(test_file, label_column, sep=',')
        cat = OneHotVectorizer() << categorical_columns
        ftree = FastTreesBinaryClassifier()
        pipeline = Pipeline([cat, ftree])
        pipeline.fit(train, label)
        metrics1, scores1 = pipeline.test(test,
                                          label1,
                                          'binary',
                                          output_scores=True)
        sum1 = metrics1.sum().sum()

        # mkstemp hands back an open descriptor; close it immediately so
        # that save_model can write to the path by name.
        (fd, modelfilename) = tempfile.mkstemp(suffix='.model.bin')
        os.close(fd)
        try:
            pipeline.save_model(modelfilename)

            pipeline2 = Pipeline()
            pipeline2.load_model(modelfilename)
            metrics2, scores2 = pipeline2.test(test,
                                               label1,
                                               'binary',
                                               output_scores=True)
            sum2 = metrics2.sum().sum()

            assert_equal(sum1, sum2,
                         "model metrics don't match after loading model")
        finally:
            # mkstemp never deletes the file it creates.
            os.remove(modelfilename)
    def test_notvectorized_output_predictor_model(self):
        """
        Check that the predictor model emitted by a combined
        (featurizer + learner) pipeline can score data that was
        featurized separately and contains no vector columns.
        """
        raw_df = train_df.drop(['c0'], axis=1)

        # Fit a standalone RangeFilter on 'c2' and apply it to the
        # training data to obtain the featurized frame.
        filter_pipeline = Pipeline(
            [RangeFilter(min=0.0, max=4.5) << 'c2'], random_state=seed)
        filter_pipeline.fit(raw_df)
        filtered_df = filter_pipeline.transform(raw_df)

        # Fit the same filter together with a regressor and request the
        # predictor-only model as an additional output.
        combined_steps = [
            RangeFilter(min=0.0, max=4.5) << 'c2',
            OnlineGradientDescentRegressor(label='c2'),
        ]
        combined_pipeline = Pipeline(combined_steps, random_state=seed)
        combined_pipeline.fit(raw_df, output_predictor_model=True)
        expected = combined_pipeline.predict(raw_df)

        # Score the pre-filtered data with the predictor-only model.
        predictor_only = Pipeline()
        predictor_only.load_model(combined_pipeline.predictor_model)
        actual = predictor_only.predict(filtered_df)

        # Only the first two rows are compared.
        for row in (0, 1):
            self.assertEqual(expected.loc[row, 'Score'],
                             actual.loc[row, 'Score'])
示例#4
0
    def test_pipeline_loaded_from_zip_has_feature_contributions(self):
        """
        Verify that a pipeline reloaded from a saved zip file reports the
        same feature contributions as the pipeline it was saved from.

        For every feature, the ``FeatureContributions.<feature>`` column
        must exist in the reloaded pipeline's output and be equal to the
        column produced by the original pipeline.
        """
        features = ['age', 'education-num', 'hours-per-week']

        model_nimbusml = Pipeline(
            steps=[FastLinearBinaryClassifier(feature=features)])
        model_nimbusml.fit(train, label)
        fc = model_nimbusml.get_feature_contributions(test)

        model_filename = get_temp_file(suffix='.zip')
        try:
            # Save the model to zip
            model_nimbusml.save_model(model_filename)
            # Load the model from zip
            model_nimbusml_zip = Pipeline()
            model_nimbusml_zip.load_model(model_filename)

            fc_zip = model_nimbusml_zip.get_feature_contributions(test)

            # Assert per column: asserting on a list of booleans would
            # pass for any non-empty list, whatever the booleans are.
            for feature in features:
                column = 'FeatureContributions.' + feature
                assert column in fc_zip.columns, \
                    "missing column after reload: " + column
                assert fc[column].equals(fc_zip[column]), \
                    "contributions differ after reload: " + column
        finally:
            # Remove the model file even when a check above fails.
            if os.path.exists(model_filename):
                os.remove(model_filename)
示例#5
0
def nimbus_pred(model_path, test_set_path):
    """Load a saved pipeline and print its predictions for a CSV test set.

    The 'c' column of the CSV is converted to the pandas "category" dtype
    before scoring. Nothing is returned; the prediction result is printed.
    """
    features = pd.read_csv(test_set_path)
    features['c'] = features['c'].astype("category")

    loaded_pipeline = Pipeline()
    loaded_pipeline.load_model(model_path)

    print(loaded_pipeline.predict(features))
 def test_binary_classifier_from_loaded_model(self):
     """
     Save the binary model, reload it into a fresh Pipeline and check
     that its permutation feature importance on the classification data
     equals the frame stored in ``self.binary_pfi``.
     """
     saved_path = get_temp_model_file()
     self.binary_model.save_model(saved_path)

     restored = Pipeline()
     restored.load_model(saved_path)

     restored_pfi = restored.permutation_feature_importance(
         self.classification_data)
     assert_frame_equal(self.binary_pfi, restored_pfi)

     os.remove(saved_path)
 def test_ranker_from_loaded_model(self):
     """
     Save the ranker model, reload it into a fresh Pipeline and check
     that its permutation feature importance on the ranking data equals
     the frame stored in ``self.ranker_pfi``.
     """
     saved_path = get_temp_model_file()
     self.ranker_model.save_model(saved_path)

     restored = Pipeline()
     restored.load_model(saved_path)

     restored_pfi = restored.permutation_feature_importance(
         self.ranking_data)
     assert_frame_equal(self.ranker_pfi, restored_pfi)

     os.remove(saved_path)
示例#8
0
    def test_model_datastream(self):
        """
        Fit a OneHotVectorizer + FastLinearBinaryClassifier pipeline and
        check that it scores the same after a pickle round trip and after
        a save_model/load_model round trip.

        Predictions (first five rows) and test metrics are compared via
        their grand totals, to two decimals.
        """
        steps = [
            ('cat', OneHotVectorizer() << categorical_columns),
            ('linear', FastLinearBinaryClassifier(shuffle=False,
                                                  number_of_threads=1)),
        ]
        fitted = Pipeline(steps=steps)
        fitted.fit(train, label)

        # ---- Round trip through pickle ----
        pickle_path = get_temp_file(suffix='.p')
        with open(pickle_path, 'wb') as sink:
            pickle.dump(fitted, sink)
        with open(pickle_path, "rb") as source:
            unpickled = pickle.load(source)
        os.remove(pickle_path)

        head_fitted = fitted.predict(test).head(5)
        head_unpickled = unpickled.predict(test).head(5)

        metrics_fitted, _ = fitted.test(
            test, test_label, output_scores=True)
        metrics_unpickled, _ = unpickled.test(
            test, test_label, output_scores=True)

        assert_almost_equal(head_fitted.sum().sum(),
                            head_unpickled.sum().sum(),
                            decimal=2)
        assert_almost_equal(metrics_fitted.sum().sum(),
                            metrics_unpickled.sum().sum(),
                            decimal=2)

        # ---- Round trip through save_model / load_model ----
        model_path = get_temp_file(suffix='.m')
        fitted.save_model(model_path)
        reloaded = Pipeline()
        reloaded.load_model(model_path)

        head_fitted = fitted.predict(test).head(5)
        head_reloaded = reloaded.predict(test).head(5)

        metrics_fitted, _ = fitted.test(
            test, test_label, output_scores=True)
        # The reloaded pipeline is told the evaluation type explicitly.
        metrics_reloaded, _ = reloaded.test(
            test, test_label, evaltype='binary', output_scores=True)

        assert_almost_equal(head_fitted.sum().sum(),
                            head_reloaded.sum().sum(),
                            decimal=2)
        assert_almost_equal(metrics_fitted.sum().sum(),
                            metrics_reloaded.sum().sum(),
                            decimal=2)

        os.remove(model_path)
示例#9
0
    def test_pass_predict_proba_from_load_model(self):
        """
        Verify that predict_proba returns the same probabilities after the
        pipeline is saved to disk and loaded into a new Pipeline.

        Probabilities are compared through their grand total
        (``probs.sum().sum()``). The temporary model file is always
        deleted, even when the test fails.
        """
        pipeline = Pipeline([LogisticRegressionBinaryClassifier()])
        pipeline.fit(X_train, y_train)
        probs1 = pipeline.predict_proba(X_test)
        sum1 = probs1.sum().sum()

        # mkstemp hands back an open descriptor; close it immediately so
        # that save_model can write to the path by name.
        (fd, modelfilename) = tempfile.mkstemp(suffix='.model.bin')
        os.close(fd)
        try:
            pipeline.save_model(modelfilename)

            pipeline2 = Pipeline()
            pipeline2.load_model(modelfilename)
            probs2 = pipeline2.predict_proba(X_test)
            sum2 = probs2.sum().sum()
            assert_equal(
                sum1, sum2,
                "model probabilities don't match after loading model")
        finally:
            # mkstemp never deletes the file it creates.
            os.remove(modelfilename)
示例#10
0
    def test_model_datastream(self):
        """
        Fit a OneHotVectorizer + FastLinearBinaryClassifier pipeline and
        check that it scores the same after a pickle round trip and after
        a save_model/load_model round trip.

        Predictions (first five rows) and test metrics are compared via
        their grand totals, to two decimals.
        """
        model_nimbusml = Pipeline(
            steps=[('cat', OneHotVectorizer() << categorical_columns),
                   ('linear',
                    FastLinearBinaryClassifier(shuffle=False, train_threads=1)
                    )])

        model_nimbusml.fit(train, label)

        # Save with pickle. The context managers guarantee the handle is
        # flushed and closed before the file is read back.
        with open('nimbusml_model.p', 'wb') as f:
            pickle.dump(model_nimbusml, f)
        with open("nimbusml_model.p", "rb") as f:
            model_nimbusml_pickle = pickle.load(f)

        score1 = model_nimbusml.predict(test).head(5)
        score2 = model_nimbusml_pickle.predict(test).head(5)

        metrics, score = model_nimbusml.test(test,
                                             test_label,
                                             output_scores=True)
        metrics_pickle, score_pickle = model_nimbusml_pickle.test(
            test, test_label, output_scores=True)

        assert_almost_equal(score1.sum().sum(), score2.sum().sum(), decimal=2)
        assert_almost_equal(metrics.sum().sum(),
                            metrics_pickle.sum().sum(),
                            decimal=2)

        # Save load with pipeline methods
        model_nimbusml.save_model('model.nimbusml.m')
        model_nimbusml_load = Pipeline()
        model_nimbusml_load.load_model('model.nimbusml.m')

        score1 = model_nimbusml.predict(test).head(5)
        score2 = model_nimbusml_load.predict(test).head(5)

        metrics, score = model_nimbusml.test(test,
                                             test_label,
                                             output_scores=True)
        # Keep the metrics in their own name instead of rebinding the
        # variable that holds the loaded pipeline.
        metrics_load, score_load = model_nimbusml_load.test(
            test, test_label, evaltype='binary', output_scores=True)

        assert_almost_equal(score1.sum().sum(), score2.sum().sum(), decimal=2)
        assert_almost_equal(metrics.sum().sum(),
                            metrics_load.sum().sum(),
                            decimal=2)
示例#11
0
    def test_combine_two_pipelines_created_from_model_files(self):
        """
        Check that two Pipelines restored from model files can be merged
        with Pipeline.combine_models and that the merged pipeline predicts
        the same scores as running the two original pipelines in sequence.
        """
        # Fit a OneHotVectorizer on 'c0' and featurize the training data,
        # asking for the result as a binary data stream.
        original_transform = Pipeline([OneHotVectorizer() << 'c0'],
                                      random_state=seed)
        original_transform.fit(train_df)
        featurized_train = original_transform.transform(
            train_df, as_binary_data_stream=True)

        # Fit the regressor on the featurized training data.
        regressor = OnlineGradientDescentRegressor(label='c2',
                                                   feature=['c0', 'c1'])
        original_predictor = Pipeline([regressor], random_state=seed)
        original_predictor.fit(featurized_train)

        # Reference result: featurize the test data, then predict.
        featurized_test = original_transform.transform(
            test_df, as_binary_data_stream=True)
        expected = original_predictor.predict(featurized_test)

        # Rebuild both pipelines purely from the model files referenced by
        # the fitted pipelines.
        restored_transform = Pipeline()
        restored_transform.load_model(original_transform.model)
        restored_predictor = Pipeline()
        restored_predictor.load_model(original_predictor.model)

        # Merge the restored pipelines and predict on the raw test data.
        merged = Pipeline.combine_models(restored_transform,
                                         restored_predictor)
        actual = merged.predict(test_df)

        # Only the first two rows are compared.
        for row in (0, 1):
            self.assertEqual(expected.loc[row, 'Score'],
                             actual.loc[row, 'Score'])
    def test_vectorized_with_prefixconcat_output_predictor_model(self):
        """
        Show that the predictor model emitted by a combined
        (featurizer + learner) pipeline can score separately featurized
        data containing vectors once a PrefixColumnConcatenator is
        prepended to it.
        """
        # Featurize the training data with a standalone OneHotVectorizer.
        one_hot_pipeline = Pipeline([OneHotVectorizer() << 'c0'],
                                    random_state=seed)
        one_hot_pipeline.fit(train_df)
        featurized_df = one_hot_pipeline.transform(train_df)

        # Fit the combined pipeline, requesting the predictor-only model,
        # and score the raw training data as the reference result.
        combined_steps = [
            OneHotVectorizer() << 'c0',
            OnlineGradientDescentRegressor(label='c2'),
        ]
        combined_pipeline = Pipeline(combined_steps, random_state=seed)
        combined_pipeline.fit(train_df, output_predictor_model=True)
        expected = combined_pipeline.predict(train_df)

        # Fit the concatenator on the featurized data.
        concatenator = Pipeline(
            [PrefixColumnConcatenator(columns={'c0': 'c0.'})])
        concatenator.fit(featurized_df)

        # Restore the predictor-only model.
        predictor_only = Pipeline()
        predictor_only.load_model(combined_pipeline.predictor_model)

        # Prepend the concatenator and score the featurized data.
        concat_then_predict = Pipeline.combine_models(concatenator,
                                                      predictor_only)
        actual = concat_then_predict.predict(featurized_df)

        # Only the first two rows are compared.
        for row in (0, 1):
            self.assertEqual(expected.loc[row, 'Score'],
                             actual.loc[row, 'Score'])
示例#13
0
    def test_ensemble_supports_output_predictor_model(self):
        """
        Verify that a pipeline ending in a VotingRegressor ensemble can
        emit a predictor-only model.

        Three predictions over the same extended test frame are compared:
        a plain pipeline, an identical pipeline fitted with
        ``output_predictor_model=True``, and a pipeline loaded from the
        emitted predictor model. The first two rows must agree across all
        three. The two full pipelines are expected to return 2 rows and
        the predictor-only pipeline 4 (presumably because it does not
        contain the RangeFilter -- confirm against nimbusml).
        """
        # Extend the test data with two extra rows. pd.concat is used
        # because DataFrame.append was removed in pandas 2.0.
        extra_rows = pd.DataFrame({'c1': [9, 11], 'c2': [1, 1]})
        test2_df = pd.concat([test_df.copy(deep=True), extra_rows],
                             ignore_index=True)
        test2_df = test2_df.astype({'c1': np.float32, 'c2': np.float32})

        # Create a ground truth pipeline
        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        combined_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c1',
                                      VotingRegressor(estimators=[r1, r2], combiner='Average')])
        combined_pipeline.fit(train_df)
        result_1 = combined_pipeline.predict(test2_df)

        # Create a duplicate pipeline but also request a predictor model
        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        combined_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c1',
                                      VotingRegressor(estimators=[r1, r2], combiner='Average')])
        combined_pipeline.fit(train_df, output_predictor_model=True)
        result_2 = combined_pipeline.predict(test2_df)

        # Create a predictor model only pipeline
        predictor_pipeline = Pipeline()
        predictor_pipeline.load_model(combined_pipeline.predictor_model)
        result_3 = predictor_pipeline.predict(test2_df)

        # Verify the first rows are equal
        self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score'])
        self.assertEqual(result_2.loc[0, 'Score'], result_3.loc[0, 'Score'])

        # Verify the second rows are equal
        self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score'])
        self.assertEqual(result_2.loc[1, 'Score'], result_3.loc[1, 'Score'])

        # Verify the number of rows
        self.assertEqual(len(result_1), 2)
        self.assertEqual(len(result_2), 2)
        self.assertEqual(len(result_3), 4)
    def test_vectorized_output_predictor_model(self):
        """
        Show that the predictor model emitted by a combined
        (featurizer + learner) pipeline cannot score separately featurized
        data that contains vectors: the predict call is expected to raise.
        """
        # Featurize the training data with a standalone OneHotVectorizer.
        one_hot_pipeline = Pipeline([OneHotVectorizer() << 'c0'],
                                    random_state=seed)
        one_hot_pipeline.fit(train_df)
        featurized_df = one_hot_pipeline.transform(train_df)

        # Fit the combined pipeline, requesting the predictor-only model.
        combined_steps = [
            OneHotVectorizer() << 'c0',
            OnlineGradientDescentRegressor(label='c2'),
        ]
        combined_pipeline = Pipeline(combined_steps, random_state=seed)
        combined_pipeline.fit(train_df, output_predictor_model=True)
        combined_pipeline.predict(train_df)

        # Restore the predictor-only model.
        predictor_only = Pipeline()
        predictor_only.load_model(combined_pipeline.predictor_model)

        # Scoring is expected to fail because the input schema does not
        # match: the model looks for a vector 'c0' with slots 'a,b', but
        # the featurized data only has the columns 'c0.a' and 'c0.b'.
        raised = False
        try:
            predictor_only.predict(featurized_df)
        except Exception:
            raised = True

        if not raised:
            self.fail()
示例#15
0
    def test_pickled_pipeline_with_predictor_model(self):
        """
        Check that a pipeline fitted with ``output_predictor_model=True``
        survives pickling: after the model files it references are deleted
        and the pipeline is unpickled, its predictor model can still be
        loaded and yields the original predictions.
        """
        float_columns = {'c1': np.float64, 'c2': np.float64}
        training_frame = pd.DataFrame(
            {'c1': [1, 2, 3, 4], 'c2': [2, 3, 4, 5]}).astype(float_columns)
        scoring_frame = pd.DataFrame(
            {'c1': [1.5, 2.3, 3.7], 'c2': [2.2, 4.9, 2.7]}
        ).astype(float_columns)

        # Fit, requesting the predictor model, and take reference scores.
        fitted = Pipeline([OnlineGradientDescentRegressor(label='c2')],
                          random_state=0)
        fitted.fit(training_frame, output_predictor_model=True)
        expected = fitted.predict(scoring_frame)

        # Both models must be set and must be distinct.
        self.assertTrue(fitted.model)
        self.assertTrue(fitted.predictor_model)
        self.assertNotEqual(fitted.model, fitted.predictor_model)

        pickle_path = get_temp_file(suffix='.p')
        with open(pickle_path, 'wb') as sink:
            pickle.dump(fitted, sink)

        # Delete the model files before unpickling.
        os.remove(fitted.model)
        os.remove(fitted.predictor_model)

        with open(pickle_path, "rb") as source:
            unpickled = pickle.load(source)

        os.remove(pickle_path)

        # Score with a pipeline built from the unpickled predictor model.
        predictor_only = Pipeline()
        predictor_only.load_model(unpickled.predictor_model)
        actual = predictor_only.predict(scoring_frame)

        self.assertTrue(expected.equals(actual))
示例#16
0
文件: Test.py 项目: naydata/Tester
def _write_sentences(path, sentences):
    """Write the sentences to ``path`` as an ``ID<TAB>Text`` file."""
    with open(path, 'w', newline='') as file:
        writer = csv.writer(file)
        # Each row is a single csv field that already contains the tab.
        writer.writerow(["ID\tText"])
        for number, sentence in enumerate(sentences, start=1):
            # Strip literal backslash-n sequences and stray backslashes.
            sentence = sentence.replace("\\n", "").replace("\\", "")
            writer.writerow([str(number) + "\t" + str(sentence)])
            logging.info(sentence)


def _build_output_record(record_id, content, observations):
    """Build one output record from the parsed content and observations."""
    if content != "":
        data = {
            "senderPosition": content.get('senderPosition', ''),
            "receipients": content.get('receipients', []),
            "topics": content.get('topics', []),
            "sendDate": content.get('sendDate', ''),
            "sender": content.get('sender', ''),
            "observations": observations,
            "senderEmail": content.get('senderEmail', ''),
            # Falls back to the whole content object when 'text' is absent.
            "text": content.get('text', content),
            "subject": content.get('subject', '')
        }
    else:
        data = {
            "senderPosition": "",
            "receipients": [],
            "topics": [],
            "sendDate": "",
            "sender": "",
            "observations": observations,
            "senderEmail": "",
            "text": "",
            "subject": ""
        }
    return {
        "recordId": record_id,
        "data": data,
        "errors": None,
        "warnings": None
    }


def main(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP trigger that scores the sentences of every incoming record.

    The JSON body is read as ``{"values": [record, ...]}`` where each
    record provides ``recordId``, ``data.sentences`` and ``data.content``
    (a JSON string; when it cannot be parsed the record is emitted with
    empty metadata).

    Processing:
      1. Download the ``model.zip`` blob from the ``artifacts`` container
         into the directory named by the ``filestore`` environment
         variable (connection string from ``storageaccountobsera1dd``).
      2. For each record, write its sentences to ``document.csv`` in that
         directory, score the file with the model, and collect one
         observation (sentence number, score, probability) per row.
      3. Return ``{"values": [...]}`` as JSON with status 200.

    On failure a plain-text response naming the failed stage is returned
    instead. A failed prediction is not fatal: the record is still
    emitted, with whatever observations were collected.
    """
    logging.info('Python HTTP trigger function processed a request.')

    try:
        payload = req.get_json()
        logging.info(payload)
        storageaccountobsera1dd = os.environ["storageaccountobsera1dd"]
        file_store = os.environ['filestore']
        blob = BlobClient.from_connection_string(
            conn_str=storageaccountobsera1dd,
            container_name="artifacts",
            blob_name="model.zip")
        with open(file_store + "/model.zip", "wb") as my_blob:
            blob_data = blob.download_blob()
            blob_data.readinto(my_blob)
    except Exception as ex:
        logging.exception(ex)
        return func.HttpResponse("input not loaded!")

    if not payload:
        # An empty body must still produce a response rather than an
        # implicit ``None`` return.
        logging.info("request body was empty")
        return func.HttpResponse("input not loaded!")

    output = []
    model_path = file_store + '/model.zip'
    document_path = file_store + '/document.csv'
    # Loaded on the first record and reused for the following ones.
    pipeline = None
    try:
        if len(payload["values"]) > 0:
            for record in payload["values"]:
                observations = []
                sentences = record["data"]["sentences"]
                try:
                    content = json.loads(record["data"]["content"])
                except Exception as err:
                    # Best effort: missing or invalid content only means
                    # the record is emitted with empty metadata.
                    logging.info(err)
                    content = ""
                logging.info("sentences extraction successful!")

                try:
                    _write_sentences(document_path, sentences)
                except Exception:
                    logging.exception("document write failed")
                    return func.HttpResponse("document write failed")

                if pipeline is None:
                    try:
                        loaded = Pipeline()
                        loaded.load_model(model_path)
                    except Exception:
                        logging.exception("model zip didnt load")
                        return func.HttpResponse("model zip didnt load")
                    pipeline = loaded

                try:
                    data = FileDataStream.read_csv(document_path,
                                                   sep='\t',
                                                   header=True)
                    result = pipeline.predict(data)
                    logging.info(result)
                    for x in range(len(result.PredictedLabel)):
                        observations.append({
                            "sentence": str(x + 1),
                            "score": float(result.Score[x]),
                            "probability": float(result.Probability[x])
                        })
                except Exception:
                    # Deliberately non-fatal, but keep the traceback.
                    logging.exception("predict failed")

                try:
                    logging.info("content is here")
                    logging.info(content)
                    output.append(_build_output_record(
                        record["recordId"], content, observations))
                except Exception:
                    # The record is dropped from the response.
                    logging.exception("output formation failed")

        records = {"values": output}
        logging.info(records)
        return func.HttpResponse(
            json.dumps(records, ensure_ascii=False),
            status_code=200,
            headers={"content-type": "application/json"})
    except Exception:
        logging.exception("sentences extraction failed!")
        return func.HttpResponse("sentences extraction failed!")