Example #1
# Imports assumed from fklearn's training modules.
from fklearn.training.classification import nlp_logistic_classification_learner
from fklearn.training.pipeline import build_pipeline
from fklearn.training.utils import log_learner_time


def training_pipeline(text_cols, target_column, vectorizer_params, logistic_params):
    # Compose a single-step pipeline around the NLP logistic regression learner
    # and wrap it so training time is logged under "tweet_sentiment_analysis".
    return log_learner_time(
        build_pipeline(
            nlp_logistic_classification_learner(
                text_feature_cols=text_cols,
                target=target_column,
                vectorizer_params=vectorizer_params,
                logistic_params=logistic_params
            )
        ), "tweet_sentiment_analysis")
Example #2
from collections import Counter

import pandas as pd

# Import assumed from fklearn's training module.
from fklearn.training.classification import nlp_logistic_classification_learner


def test_nlp_logistic_classification_learner():
    df_train_binary = pd.DataFrame({
        'id': ["id1", "id2", "id3", "id4"],
        'x1': [10.0, 13.0, 10.0, 13.0],
        "text1": ["banana manga", "manga açaí", "banana banana", "Manga."],
        "text2": ["banana mamao", "manga açaí", "banana banana", "Manga."],
        'y': [0, 1, 0, 1]
    })

    df_test_binary = pd.DataFrame({
        'id': ["id4", "id4", "id5", "id6", "id5", "id6"],
        'x1': [0.0, 3.0, 2.0, -13.0, 2.0, -13.0],
        "text1": ["banana manga", "manga açaí", "banana banana", "Manga.", "banana manga", "manga açaí"],
        "text2": ["banana manga", "manga açaí", "jaca banana", "Manga.", "jaca banana", "Manga."],
        'y': [1, 0, 0, 1, 0, 1]
    })

    df_train_multinomial = pd.DataFrame({
        'id': ["id1", "id2", "id3", "id4", "id3", "id4"],
        'x1': [10.0, 13.0, 10.0, 13.0, 10.0, 13.0],
        "text": ["banana manga", "manga açaí", "banana banana", "Manga.", "banana banana", "Manga."],
        'y': [0, 1, 2, 0, 1, 2]
    })

    df_test_multinomial = pd.DataFrame({
        'id': ["id4", "id4", "id5", "id6", "id5", "id6"],
        'x1': [0.0, 3.0, 2.0, -13.0, 2.0, -13.0],
        "text": ["abacaxi manga", "manga açaí", "banana banana", "Abacaxi.", "banana banana", "Manga."],
        'y': [0, 1, 2, 0, 2, 1]
    })

    # test binomial case
    learner_binary = nlp_logistic_classification_learner(text_feature_cols=["text1", "text2"],
                                                         target="y",
                                                         vectorizer_params={"min_df": 1},
                                                         logistic_params=None,
                                                         prediction_column="prediction")

    predict_fn, pred_train, log = learner_binary(df_train_binary)

    pred_test = predict_fn(df_test_binary)

    expected_col_train = df_train_binary.columns.tolist() + ["prediction"]
    expected_col_test = df_test_binary.columns.tolist() + ["prediction"]

    assert Counter(expected_col_train) == Counter(pred_train.columns.tolist())
    assert Counter(expected_col_test) == Counter(pred_test.columns.tolist())
    assert pred_test.prediction.max() < 1
    assert pred_test.prediction.min() > 0
    assert (pred_test.columns == pred_train.columns).all()

    # test multinomial case
    learner_multinomial = nlp_logistic_classification_learner(text_feature_cols=["text"],
                                                              target="y",
                                                              vectorizer_params={"min_df": 1},
                                                              logistic_params={"multi_class": "multinomial",
                                                                               "solver": "sag",
                                                                               "max_iter": 200},
                                                              prediction_column="prediction")

    predict_fn, pred_train, log = learner_multinomial(df_train_multinomial)

    pred_test = predict_fn(df_test_multinomial)

    expected_col_train = df_train_multinomial.columns.tolist() + ["prediction_0", "prediction_1", "prediction_2"]
    expected_col_test = df_test_multinomial.columns.tolist() + ["prediction_0", "prediction_1", "prediction_2"]
    assert Counter(expected_col_train) == Counter(pred_train.columns.tolist())
    assert Counter(expected_col_test) == Counter(pred_test.columns.tolist())
    assert (pred_test.columns == pred_train.columns).all()
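
In the multinomial case the learner emits one probability column per class (prediction_0, prediction_1, prediction_2) rather than a single prediction column. A minimal sketch of collapsing those columns into a hard class label, assuming the pred_test frame produced above (this post-processing is not part of fklearn itself):

prob_cols = ["prediction_0", "prediction_1", "prediction_2"]
# Pick the class whose probability column is highest, e.g. "prediction_2" -> 2.
pred_test["predicted_class"] = (
    pred_test[prob_cols]
    .idxmax(axis=1)
    .str.replace("prediction_", "", regex=False)
    .astype(int)
)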
import logging

# Imports assumed from fklearn's preprocessing and training modules;
# `data` is expected to be a pandas DataFrame loaded earlier in the script.
from fklearn.preprocessing.splitting import time_split_dataset
from fklearn.training.classification import nlp_logistic_classification_learner

# Time-based Train/Dev split
logging.info("Perform Train/Dev split")
train, dev = time_split_dataset(data,
                                time_column="input_time",
                                train_start_date="2017-01-01",
                                train_end_date="2018-01-01",
                                holdout_end_date="2019-01-01")
logging.info("Train set (Complaints 2017): {}".format(train.shape))
logging.info("Dev set (Complaints 2018): {}".format(dev.shape))

# Train
logging.info("Start baseline training")
baseline_p_fn, _, baseline_log = nlp_logistic_classification_learner(
    train,
    logistic_params={"solver": "liblinear"},
    text_feature_cols=["text"],
    target="target")
logging.info("Finished baseline training")

logging.info("Start model training")
p_fn, train_pred, log = nlp_logistic_classification_learner(
    train,
    vectorizer_params={
        "strip_accents": "unicode",
        "stop_words": "english"
    },
    logistic_params={
        "solver": "liblinear",
        "class_weight": "balanced"
    },