def test_extract_features_with_wrong_feature_list():
    """extract_features must raise ValueError for unrecognised feature names.

    The tagged CSV is located relative to ``parents[3]`` of this test file.
    """
    base_dir = pathlib.Path(__file__).parents[3]
    csv_path = base_dir / 'source/CyberBullying/dataNew.csv'
    frame = utils.read_to_df(csv_path)
    unknown_features = ['wrong', 'feature', 'list']

    with pytest.raises(ValueError):
        fe.extract_features(frame, unknown_features)
def test_extract_features_with_bad_file():
    """extract_features must raise ValueError for a malformed input frame.

    The input is a one-row frame whose only column is ``unknown``.
    """
    test_df = pd.DataFrame(['not valid'], columns=["unknown"])
    feature_list = [
        'post_length', 'tfidf', 'topics', 'screamer', 'words', 'off_dis',
        'not_off_dis'
    ]
    # Only the call under test lives inside ``raises``: a ValueError raised
    # while building the fixture must fail the test, not satisfy it.
    with pytest.raises(ValueError):
        fe.extract_features(test_df, feature_list)
예제 #3
0
def test_feature_extraction():
    """Feature extraction over the tagged CSV must return a pandas DataFrame.

    Exceptions are intentionally not caught: pytest already reports any
    uncaught exception as a failure, and it keeps the original exception type
    and traceback. A blanket ``except Exception`` would also catch the
    AssertionError below and hide which check failed.
    """
    tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    # Use the frame returned by preprocess, as test_train_model does, so the
    # features are extracted from the preprocessed data.
    tagged_df = pre.preprocess(tagged_df)
    features = fe.extract_features(tagged_df, FEATURE_LIST)
    assert isinstance(features, pd.DataFrame)
def test_correct_features():
    """Check the extract_features output for two short Hebrew posts.

    Verifies that the result contains no nulls, that every expected column is
    present, that ``post_length`` is 4 and 2 for the two rows, and that the
    remaining checked features are strictly positive in both rows.
    """
    posts = pd.DataFrame(
        [[1, 'מילה עוד מילה חמודה', 0], [2, 'את מטומטמת', 3]],
        columns=['id', 'text', 'cb_level'])
    requested = [
        'post_length', 'tfidf', 'topics', 'screamer', 'words', 'off_dis',
        'not_off_dis'
    ]
    features = fe.extract_features(posts, requested)

    # No feature may be missing for any row.
    assert not features.isnull().values.any()

    required_columns = (
        'id', 'post_length', 'tfidf', 'T1', 'T2', 'T3', 'screamer', 'off_dis',
        'not_off_dis',
    )
    for name in required_columns:
        assert name in features.columns

    # Expected post_length per row, in row order.
    for row, length in enumerate((4, 2)):
        assert features.iloc[row]['post_length'] == length

    # These features must be strictly positive for both posts.
    for name in ('tfidf', 'off_dis', 'not_off_dis', 'T1', 'T2', 'T3'):
        for row in (0, 1):
            assert features.iloc[row][name] > 0
def test_train_model():
    """Training a RandomForest on the tagged CSV must leave a non-None model.

    The target is 1 where ``cb_level`` equals 3 and 0 otherwise; the ``id``
    column is removed from the features before training.
    """
    forest = rf.RandomForest()
    frame = pre.preprocess(
        utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv'))
    requested = [
        'post_length', 'tfidf', 'topics', 'screamer', 'words', 'off_dis',
        'not_off_dis',
    ]
    features = fe.extract_features(frame, requested).drop(columns=['id'])
    labels = (frame['cb_level'] == 3).astype(int)

    forest.train(features, labels)
    assert forest.model is not None
def test_explain_class():
    """Run explain_class on one post and check the force-plot image exists.

    Uses the stored RandomForest model and a single-row frame built from one
    Hebrew post.
    """
    stored_model = utils.get_model(
        ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    single_post = pd.DataFrame({'id': [1], 'text': ['אני אוהבת אותך']})
    single_post = pre.preprocess(single_post)
    requested = [
        'post_length', 'tfidf', 'topics', 'screamer', 'words', 'off_dis',
        'not_off_dis',
    ]
    features = fe.extract_features(single_post, requested).drop(columns=['id'])

    explanation.explain_class(stored_model, features)

    plot_path = ROOT + '/source/CyberBullying/outputs/force_plot_post.png'
    assert os.path.isfile(plot_path)
def test_explain_dataset():
    """Run explain_model on the tagged CSV and check the plot images exist.

    After the call, the dependence plot, the bar summary plot and the summary
    plot must all be present in the outputs folder.
    """
    stored_model = utils.get_model(
        ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    frame = pre.preprocess(
        utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv'))
    requested = [
        'post_length', 'tfidf', 'topics', 'screamer', 'words', 'off_dis',
        'not_off_dis',
    ]
    features = fe.extract_features(frame, requested).drop(columns=['id'])

    explanation.explain_model(stored_model, features)

    expected_plots = (
        '/source/CyberBullying/outputs/dependence_plot.png',
        '/source/CyberBullying/outputs/summary_plot_bar.png',
        '/source/CyberBullying/outputs/summary_plot.png',
    )
    for relative_path in expected_plots:
        assert os.path.isfile(ROOT + relative_path)
예제 #8
0
def test_explainability():
    """Smoke test: explain_model and explain_class run on the tagged CSV.

    The test passes when both explanation calls complete without raising.
    Exceptions are intentionally not caught: pytest reports an uncaught
    exception as a failure with its original type and traceback, whereas a
    blanket ``except Exception`` would replace that with a generic message.
    """
    model = utils.get_model(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    tagged_df = pre.preprocess(tagged_df)
    X = fe.extract_features(tagged_df, FEATURE_LIST)
    X = X.drop(columns=['id'])
    explanation.explain_model(model, X)
    explanation.explain_class(model, X)
예제 #9
0
def test_classicifation():
    """Train and predict with RandomForest, then check the pickled model exists.

    The target is 1 where ``cb_level`` equals 3 and 0 otherwise. Exceptions
    are intentionally not caught: pytest reports an uncaught exception as a
    failure with its original type and traceback, whereas a blanket
    ``except Exception`` would also catch the AssertionError below and
    replace it with a generic message.
    """
    tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    # Use the frame returned by preprocess, as test_train_model does, so that
    # features and labels both come from the preprocessed data.
    tagged_df = pre.preprocess(tagged_df)
    X = fe.extract_features(tagged_df, FEATURE_LIST)
    y = (tagged_df['cb_level'] == 3).astype(int)
    X = X.drop(columns=['id'])
    rf_obj = rf.RandomForest()
    rf_obj.train(X, y)
    rf_obj.predict(X)
    assert os.path.isfile(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
def test_correct_classification():
    """Predicting a single post must yield a numpy.float64 as first result.

    The stored RandomForest model is attached to a fresh RandomForest wrapper
    and used to predict on a one-row frame built from one Hebrew post.
    """
    stored_model = utils.get_model(
        ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    classifier = rf.RandomForest()
    classifier.model = stored_model

    single_post = pd.DataFrame({'id': [1], 'text': ['אני אוהבת אותך']})
    single_post = pre.preprocess(single_post)
    requested = [
        'post_length', 'tfidf', 'topics', 'screamer', 'words', 'off_dis',
        'not_off_dis',
    ]
    features = fe.extract_features(single_post, requested).drop(columns=['id'])

    first_result = classifier.predict(features)[0]
    assert first_result is not None
    assert isinstance(first_result, numpy.float64)
예제 #11
0
def test_performance():
    """get_performances must report f-score, recall and precision.

    Predictions come from the stored RandomForest model on the tagged CSV and
    are mapped to 1 where the predicted value exceeds 0.5, otherwise 0.
    """
    stored_model = utils.get_model(
        ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    classifier = rf.RandomForest()
    classifier.model = stored_model

    frame = pre.preprocess(
        utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv'))
    features = fe.extract_features(frame, FEATURE_LIST).drop(columns=['id'])
    labels = (frame['cb_level'] == 3).astype(int)

    scores = classifier.predict(features)
    predicted = np.where(scores > 0.5, 1, 0)
    performance = per.get_performances(labels, predicted)

    for metric in ('f-score', 'recall', 'precision'):
        assert metric in performance.keys()
예제 #12
0
get tagged df
"""
# Load the tagged dataset using the reader's default arguments.
tagged_df = utils.read_to_df()  # Vigo data
# Alternative data source, currently disabled:
# tagged_df = utils.create_csv_from_keepers_files()  # Keepers data

"""
Run a pre-processing function on the tagged data
"""
tagged_df = pre.preprocess(tagged_df)

"""
Run a extract features function on the clean and tagged data
"""
feature_list = ['post_length', 'tfidf', 'topics', 'screamer', 'words', 'off_dis', 'not_off_dis']
# Hand the logger's folder name to the feature-extraction module before
# extracting (presumably so both write to the same output folder; confirm).
fe.folder_name = logger.folder_name
X = fe.extract_features(tagged_df, feature_list)
logger.write_features(feature_list)
# Binary target: 1 where cb_level equals 3, otherwise 0.
y = (tagged_df['cb_level'] == 3).astype(int)
# Remove the 'id' column so it is not part of the feature matrix.
X = X.drop(columns=['id'])

"""
Split data to train and test
"""
# test_size=0.2 holds out 20% of the rows for testing. No random_state is
# passed, so (assuming this is scikit-learn's train_test_split) the split
# differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Both start as empty dicts despite the "_list" suffix; they are filled later
# in the script (not visible here).
performances_list = {}
auc_list = {}

"""
Running the baseline model
"""