Python load_dataset示例，classification_model.processing.data_management.load_dataset Python示例

示例#1

0

显示文件

文件： test_controller.py 项目： lok63/Text-Classification-

def test_prediction_endpoint_returns_prediction(flask_test_client):
    """POST the first test row to the classification endpoint and check the reply."""
    # Given: the first record of the test dataset named in model_config,
    # serialised as a JSON "records" string. Loading the data through
    # load_dataset keeps a single source of test data instead of spreading
    # copies across packages.
    dataset = load_dataset(file_name=model_config.TESTING_DATA_FILE)
    first_row = dataset[0:1]
    post_json = first_row.to_json(orient='records')

    print("########################################")
    print(post_json)

    # When: the serialised row is posted to the prediction endpoint.
    # NOTE(review): post_json is already a JSON string, so passing it via
    # `json=` encodes it a second time -- confirm the endpoint expects that.
    endpoint = '/v1/predict/classification'
    response = flask_test_client.post(endpoint, json=post_json)

    # Then: the call succeeds, the prediction rounds up to 0 and the
    # reported version matches the package version.
    assert response.status_code == 200
    payload = json.loads(response.data)
    prediction = payload['predictions']
    response_version = payload['version']

    assert math.ceil(prediction) == 0
    assert response_version == _version

示例#2

0

显示文件

def test_model_for_differential(*, save_file='test_data_predictions.csv'):
    """Compare the current model's predictions with those stored in ``save_file``."""
    # Predictions captured earlier, read from the API package root.
    saved_df = pd.read_csv(f'{api_config.PACKAGE_ROOT}/{save_file}')
    previous_model_predictions = saved_df.predictions.values
    print('previous predictions:', previous_model_predictions)

    test_data = load_dataset(file_name=config.TESTING_DATA_FILE)
    test_data.drop('id', axis=1, inplace=True)

    # Cast every discrete feature column to str in a single assignment.
    discrete_columns = (config.DISCRETE_SET1_FEATURES
                        + config.DISCRETE_SET2_FEATURES
                        + config.DISCRETE_SET3_FEATURES)
    test_data[discrete_columns] = test_data[discrete_columns].astype(str)

    # Score the first 200 rows with the current model.
    current_result = make_prediction(input_data=test_data[0:200])
    current_model_predictions = current_result.get('predictions')
    print('current predictions:', current_model_predictions)

    assert len(previous_model_predictions) == len(current_model_predictions)

    pairs = zip(previous_model_predictions, current_model_predictions)
    for old, new in pairs:
        # .item() converts the numpy scalars to plain Python numbers.
        # NOTE(review): with rel_tol=1 any two non-negative values compare
        # as close, so this check cannot fail for 0/1 labels -- confirm the
        # intended tolerance.
        assert math.isclose(old.item(), new.item(), rel_tol=1)

示例#3

0

显示文件

def run_training():
    """Train the model."""
    # Load the raw training data.
    data = load_dataset(config.TRAINING_DATA_FILE)
    print("------------")

    # Keep only rows whose message_source is "customer", and only the
    # message / case_type columns.
    from_customer = data["message_source"] == "customer"
    data = data[from_customer][["message","case_type"]]

    # Stratified 80/20 splitter, seeded from config.
    sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=config.SEED)

    X = data.drop(config.TARGET, axis=1)
    # Binary target: 0 for "cancel_order", 1 for every other value.
    y = data[config.TARGET].apply(lambda label: 0 if label == "cancel_order" else 1)

    # All five splits are generated, but only the last one is kept.
    *_, (train_index, test_index) = sss.split(X, y)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Restrict both partitions to the configured feature columns.
    X_train = X_train[config.FEATURES]
    X_test = X_test[config.FEATURES]

    print(X_train.iloc[0])

    # Fit on the training partition, then persist the fitted pipeline.
    pipeline.full_pipe.fit(X_train, y_train)
    save_pipeline(pipeline_to_persist=pipeline.full_pipe)

示例#4

0

显示文件

def test_make_single_prediction():
    """Predict on the first row of test.csv and check the single result.

    The result must not be None, and its first prediction must be an
    ``np.int64`` that rounds up to 0.
    """
    # Given
    test_data = load_dataset(file_name='test.csv')
    single_test_input = test_data[0:1]

    # When
    subject = make_prediction(input_data=single_test_input)

    # Then
    # Check for None before dereferencing, so a missing result is reported
    # as an assertion failure rather than an AttributeError.
    assert subject is not None
    first_prediction = subject.get('predictions')[0]
    assert isinstance(first_prediction, np.int64)
    assert math.ceil(first_prediction) == 0

示例#5

0

显示文件

def capture_predictions(*, save_file: str = 'test_data_predictions.csv'):
    """Predict on the first 200 rows of test.csv and write the result to CSV.

    The file is written to ``api_config.PACKAGE_ROOT`` under ``save_file``.
    """
    test_data = load_dataset(file_name='test.csv')
    test_data.drop('id', axis=1, inplace=True)

    # Cast every discrete feature column to str in a single assignment.
    discrete_columns = (config.DISCRETE_SET1_FEATURES
                        + config.DISCRETE_SET2_FEATURES
                        + config.DISCRETE_SET3_FEATURES)
    test_data[discrete_columns] = test_data[discrete_columns].astype(str)

    first_rows = test_data[0:200]
    predictions = make_prediction(input_data=first_rows)

    destination = f'{api_config.PACKAGE_ROOT}/{save_file}'
    pd.DataFrame(predictions).to_csv(destination)

示例#6

0

显示文件

文件： test_predict.py 项目： amreshd04/model-build-deploy-CI-CD-using-CircleCI-to-EC2

def test_single_prediction():
    """Predict on the first test row and check the single result.

    The first prediction must be an ``np.int64`` that rounds up to 0.
    """
    test_data = load_dataset(file_name=config.TESTING_DATA_FILE)

    # Cast every discrete feature column to str before predicting.
    discrete_columns = (config.DISCRETE_SET1_FEATURES
                        + config.DISCRETE_SET2_FEATURES
                        + config.DISCRETE_SET3_FEATURES)
    test_data[discrete_columns] = test_data[discrete_columns].astype(str)
    single_test_input = test_data[0:1]

    subject = make_prediction(input_data=single_test_input[config.FEATURES])

    assert subject is not None
    first_prediction = subject.get('predictions')[0]
    assert isinstance(first_prediction, np.int64)
    # The comparison belongs outside math.ceil(); the previous form applied
    # ceil to the boolean result of `prediction == 0`.
    assert math.ceil(first_prediction) == 0

示例#7

0

显示文件

def test_make_single_prediction():
    """Predict on the first row of test.csv, passed as a JSON records string.

    The result must not be None and its first prediction must round up to 0.
    """
    # Given
    test_data = load_dataset(file_name='test.csv')
    single_test_json = test_data[0:1].to_json(orient='records')

    # When
    subject = make_prediction(input_data=single_test_json)

    # Then
    # Check for None before dereferencing, so a missing result is reported
    # as an assertion failure rather than an AttributeError.
    assert subject is not None
    first_prediction = subject.get('predictions')[0]
    print(type(first_prediction))
    assert math.ceil(first_prediction) == 0

示例#8

0

显示文件

def test_make_multiple_predictions():
    """Predict on all of test.csv and check how many predictions come back."""
    # Given: the complete test dataset.
    test_data = load_dataset(file_name='test.csv')
    original_data_length = len(test_data)

    # When: the whole frame is scored in one call.
    subject = make_prediction(input_data=test_data)

    # Then
    assert subject is not None
    assert len(subject.get('predictions')) == 417

    # Some rows are expected to be filtered out, so the number of
    # predictions must differ from the number of input rows.
    assert len(subject.get('predictions')) != original_data_length

示例#9

0

显示文件

文件： train_pipeline.py 项目： AntonisCSt/Deploy-Machine-Learning-Pipeline

def run_training() -> None:
    """Train the model."""
    # Load the training data.
    data = load_dataset(file_name=config.TRAINING_DATA_FILE)

    # Separate the target column from the remaining columns.
    features = data.drop(config.TARGET, axis=1)
    target = data[config.TARGET]

    # Hold out 20% of the rows; the seed comes from config.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=config.RANDOM_SEED)

    # Fit the pipeline on the training partition.
    pipeline.titanic_pipe.fit(X_train, y_train)

    # Log the version being saved, then persist the fitted pipeline.
    _logger.info(f"saving model version: {_version}")
    save_pipeline(pipeline_to_persist=pipeline.titanic_pipe)

示例#10

0

显示文件

def test_make_multiple_predictions():
    """Predict on all of test.csv (as JSON records) and expect a non-None result."""
    # Given: the complete test dataset serialised as JSON records.
    frame = load_dataset(file_name='test.csv')
    original_data_length = len(frame)
    records_json = frame.to_json(orient='records')

    # When
    subject = make_prediction(input_data=records_json)

    # Then
    assert subject is not None
    # Two further checks are currently disabled: the exact prediction count
    # (== 1451) and the "some rows are filtered out" comparison against
    # original_data_length.

示例#11

0

显示文件

文件： test_controller.py 项目： CIAI-RnD-Team/Titanic_ML_Model

def test_prediction_endpoint_returns_prediction(flask_test_client):
    """POST the first test row to the classification endpoint and check the reply."""
    # Given: the first row of the test data, serialised as JSON records.
    first_row = load_dataset(file_name=model_config.TESTING_DATA_FILE)[0:1]
    post_json = first_row.to_json(orient='records')

    # When: the decoded records are posted as the JSON request body.
    endpoint = '/v1/predict/classification'
    response = flask_test_client.post(endpoint, json=json.loads(post_json))

    # Then: the call succeeds, the first prediction rounds up to 0 and the
    # reported version matches the package version.
    assert response.status_code == 200
    payload = json.loads(response.data)
    prediction = payload['predictions']
    response_version = payload['version']
    assert math.ceil(prediction[0]) == 0
    assert response_version == _version

示例#12

0

显示文件

文件： capture_model_predictions.py 项目： CIAI-RnD-Team/Titanic_ML_Model

def capture_predictions() -> None:
    """Save the test data predictions to a CSV."""
    test_data = load_dataset(file_name='test.csv')

    # Rows 99..599: per the original author, a slice with no input
    # validation issues.
    predictions = make_prediction(input_data=test_data[99:600])

    # Tabulate the predictions for the test dataset.
    predictions_df = pd.DataFrame(predictions)

    # Hack: write into the classification model package of the repo
    # rather than the installed package.
    save_file = 'test_data_predictions.csv'
    destination = f'{config.PACKAGE_ROOT}/{save_file}'
    predictions_df.to_csv(destination)

示例#13

0

显示文件

文件： test_predict.py 项目： lok63/Text-Classification-

def test_make_single_prediction():
    """Predict on the first row of test.csv, passed as a JSON records string."""
    # Given: a single instance serialised as JSON records.
    first_row = load_dataset(file_name='test.csv')[0:1]
    single_test_json = first_row.to_json(orient='records')

    # When: the classifier is asked for a prediction.
    subject = make_prediction(input_data=single_test_json)

    # Then: the result is not empty ...
    assert subject is not None
    # ... the prediction is an int64 (expected to be either 0 or 1) ...
    assert isinstance(subject.get('predictions')[0], np.int64)
    # ... and the first row is known to be predicted as 0.
    assert math.ceil(subject.get('predictions')[0]) == 0

示例#14

0

显示文件

def test_prediction_endpoint_validation_200(flask_test_client):
    """POST the whole test dataset and check every row is accounted for.

    Each input row must appear either as a prediction or as an error, so
    the two counts together must equal the number of rows sent.
    """
    # Given
    # Load the test data from the classification_model package.
    test_data = load_dataset(file_name=config.TESTING_DATA_FILE)
    post_json = test_data.to_json(orient='records')

    # When
    response = flask_test_client.post('/v1/predict/classification',
                                      json=json.loads(post_json))

    # Then
    assert response.status_code == 200
    response_json = json.loads(response.data)

    predictions = response_json.get('predictions')
    # Treat a null or absent `errors` field as "no errors" instead of
    # raising TypeError from len(None).
    # NOTE(review): presumably the API reports null when every row
    # validates -- confirm against the endpoint.
    errors = response_json.get('errors') or {}

    # Check correct number of errors removed
    assert len(predictions) + len(errors) == len(test_data)

示例#15

0

显示文件

文件： train_pipeline.py 项目： CIAI-RnD-Team/Titanic_ML_Model

def run_training():
    """Train the model."""
    # Load the training data.
    data = load_dataset(file_name=config.TRAINING_DATA_FILE)

    # Separate the target column from the remaining columns.
    features = data.drop(config.TARGET, axis=1)
    target = data[config.TARGET]

    # Hold out 20% of the rows with a fixed seed of 0.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=0)

    # Fit the pipeline and dump it under the configured pipeline name.
    pipeline.titanic_pipe.fit(X_train, y_train)
    joblib.dump(pipeline.titanic_pipe, config.PIPELINE_NAME)

    # Log the version, then persist through save_pipeline as well.
    _logger.info(f"saving model version: {_version}")
    save_pipeline(pipeline_to_persist=pipeline.titanic_pipe)

示例#16

0

显示文件

文件： test_predict.py 项目： lok63/Text-Classification-

def test_make_multiple_predictions():
    """Predict on all of test.csv and compare with four known labels."""
    # Given: the whole test dataset as JSON records, plus the expected labels.
    test_data = load_dataset(file_name='test.csv')
    original_data_length = len(test_data)
    multiple_test_json = test_data.to_json(orient='records')

    true_predictions = [0, 1, 0, 1]

    print("#####################")
    print(multiple_test_json)

    # When
    subject = make_prediction(input_data=multiple_test_json)

    # Then: exactly four predictions, each equal to its expected label.
    assert subject is not None
    assert len(subject.get('predictions')) == 4
    labelled = zip(subject.get('predictions'), true_predictions)
    for i, (pred, expected) in enumerate(labelled):
        print(i)
        assert pred == expected

示例#17

0

显示文件

文件： test_validation.py 项目： amreshd04/model-build-deploy-CI-CD-using-CircleCI-to-EC2

def test_prediction_endpoint(flask_test_client):
    """POST up to 500 test records and check the response.

    The response must carry one prediction per record sent and a null
    ``errors`` field.
    """
    # Given
    test_data = load_dataset(file_name=config.TESTING_DATA_FILE)

    # Cast every discrete feature column to str before serialising.
    discrete_columns = (config.DISCRETE_SET1_FEATURES
                        + config.DISCRETE_SET2_FEATURES
                        + config.DISCRETE_SET3_FEATURES)
    test_data[discrete_columns] = test_data[discrete_columns].astype(str)
    post_json = test_data[config.FEATURES].to_json(orient='records')

    input_data_sent_to_model = json.loads(post_json)[0:500]

    # When
    response = flask_test_client.post('/v1/predict/classification',
                                      json=input_data_sent_to_model)

    # Then
    assert response.status_code == 200
    # Parse the body once and reuse it for both checks.
    response_json = json.loads(response.data)
    assert len(input_data_sent_to_model) == len(response_json['predictions'])
    # Identity comparison is the idiomatic way to test for None.
    assert response_json['errors'] is None

示例#18

0

显示文件

def test_model_prediction_differential(
        *,
        save_file: str = 'test_data_predictions.csv'):
    """
    Check that the current model's predictions stay close to the
    predictions saved from the previous model.
    """
    # Given: the previous model's saved predictions ...
    saved_df = pd.read_csv(f'{config.PACKAGE_ROOT}/{save_file}')
    previous_model_predictions = saved_df.predictions.values

    # ... and rows 99..599 of the test dataset.
    test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE)
    multiple_test_input = test_data[99:600]

    # When: the current model scores the same rows.
    current_result = make_prediction(input_data=multiple_test_input)
    current_model_predictions = current_result.get('predictions')

    # Then: both runs produced the same number of predictions ...
    assert len(previous_model_predictions) == len(
        current_model_predictions)

    # ... and each pair agrees within the configured relative tolerance.
    # rel_tol is the maximum allowed difference relative to the larger
    # absolute value of the two numbers (e.g. 0.05 means 5%).
    pairs = zip(previous_model_predictions, current_model_predictions)
    for old, new in pairs:
        # .item() converts the numpy scalars to plain Python numbers.
        assert math.isclose(
            old.item(),
            new.item(),
            rel_tol=model_config.ACCEPTABLE_MODEL_DIFFERENCE)

示例#19

0

显示文件

文件： test_predict.py 项目： amreshd04/model-build-deploy-CI-CD-using-CircleCI-to-EC2

def test_multiple_predictions():
    """Predict on the whole test dataset and check the prediction count."""
    # Given: the test dataset without its 'id' column, with every discrete
    # feature column cast to str.
    test_data = load_dataset(file_name=config.TESTING_DATA_FILE)
    test_data.drop('id', axis=1, inplace=True)

    discrete_columns = (config.DISCRETE_SET1_FEATURES
                        + config.DISCRETE_SET2_FEATURES
                        + config.DISCRETE_SET3_FEATURES)
    test_data[discrete_columns] = test_data[discrete_columns].astype(str)

    original_length = len(test_data)

    # When: the whole frame is scored in one call.
    subject = make_prediction(input_data=test_data)

    # Then: a result exists and holds exactly 127037 predictions.
    assert subject is not None
    assert len(subject.get('predictions')) == 127037

示例#20

0

显示文件

文件： train_pipeline.py 项目： amreshd04/model-build-deploy-CI-CD-using-CircleCI-to-EC2

def run_training() -> None:
    """Fit the random-forest pipeline on the training data and persist it."""
    data = load_dataset(file_name=config.TRAINING_DATA_FILE)

    # Hold out 10% of the rows with a fixed seed of 0.
    X_train, X_test, y_train, y_test = train_test_split(
        data[config.FEATURES],
        data[config.TARGET],
        test_size=0.1,
        random_state=0)

    # Cast the discrete feature columns to str in both partitions.
    discrete_columns = (config.DISCRETE_SET1_FEATURES
                        + config.DISCRETE_SET2_FEATURES
                        + config.DISCRETE_SET3_FEATURES)
    for partition in (X_train, X_test):
        partition[discrete_columns] = partition[discrete_columns].astype(str)

    pipeline.rf_pipe.fit(X_train[config.FEATURES], y_train)

    # Log the version being saved, then persist the fitted pipeline.
    _logger.info(f"saving model version: {_version}")
    save_pipeline(pipeline_to_persist=pipeline.rf_pipe)

示例#21

0

显示文件

def test_prediction_endpoint(flask_test_client):
    """POST the first test record and check prediction and version.

    The first prediction must round up to 0 and the reported version must
    match the package version.
    """
    # Given
    test_data = load_dataset(file_name=config.TESTING_DATA_FILE)

    # Cast every discrete feature column to str before serialising.
    discrete_columns = (config.DISCRETE_SET1_FEATURES
                        + config.DISCRETE_SET2_FEATURES
                        + config.DISCRETE_SET3_FEATURES)
    test_data[discrete_columns] = test_data[discrete_columns].astype(str)
    post_json = test_data[config.FEATURES].to_json(orient='records')

    input_data_sent_to_model = json.loads(post_json)[0:1]

    # When
    response = flask_test_client.post('/v1/predict/classification',
                                      json=input_data_sent_to_model)

    # Then
    # Check the status before touching the body, so a failed request is
    # reported as a status mismatch rather than a parse or key error.
    assert response.status_code == 200

    response_json = json.loads(response.data)
    prediction = response_json['predictions']
    response_version = response_json['version']

    assert math.ceil(prediction[0]) == 0
    assert response_version == _version