예제 #1
0
def test_read_data_correctly():
    """Load the dataset CSV and check its expected columns and row count."""
    # The dataset location is resolved from `parents[3]` of this test file.
    project_root = pathlib.Path(__file__).parents[3]
    dataset_path = project_root / 'source/CyberBullying/dataNew.csv'

    loaded = utils.read_to_df(dataset_path)

    for required_column in ('text', 'cb_level'):
        assert required_column in loaded.columns
    row_count = loaded.shape[0]
    assert row_count > 0
예제 #2
0
def test_feature_extraction():
    """Check that extracting features from the preprocessed data yields a DataFrame.

    Exceptions are deliberately not caught here: pytest already fails the test
    on any exception, and letting it propagate keeps the real exception type
    and traceback (including assertion details) in the report.
    """
    tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    # Use the value returned by preprocess, as the other tests in this file do.
    tagged_df = pre.preprocess(tagged_df)
    features = fe.extract_features(tagged_df, FEATURE_LIST)
    assert isinstance(features, pd.DataFrame)
def test_extract_features_with_wrong_feature_list():
    """Feature extraction must raise ValueError for an unrecognised feature list."""
    unknown_features = ['wrong', 'feature', 'list']
    dataset_path = pathlib.Path(__file__).parents[3] / 'source/CyberBullying/dataNew.csv'
    raw_df = utils.read_to_df(dataset_path)

    # Only the call under test lives inside the raises-context.
    with pytest.raises(ValueError):
        fe.extract_features(raw_df, unknown_features)
def test_train_model():
    """Train a RandomForest on the extracted features and check a model is set."""
    classifier = rf.RandomForest()
    dataset = pre.preprocess(
        utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    )
    requested_features = [
        'post_length', 'tfidf', 'topics', 'screamer', 'words', 'off_dis', 'not_off_dis',
    ]

    # The 'id' column is removed from the feature matrix before training.
    features = fe.extract_features(dataset, requested_features).drop(columns=['id'])
    # Binary target: 1 where cb_level equals 3, otherwise 0.
    target = (dataset['cb_level'] == 3).astype(int)

    classifier.train(features, target)
    assert classifier.model is not None
def test_explain_dataset():
    """Explain the saved model over the dataset and check the plot files exist."""
    trained_model = utils.get_model(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    dataset = pre.preprocess(
        utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    )
    requested_features = [
        'post_length', 'tfidf', 'topics', 'screamer', 'words', 'off_dis', 'not_off_dis',
    ]
    # The 'id' column is removed before the features are handed to the explainer.
    features = fe.extract_features(dataset, requested_features).drop(columns=['id'])

    explanation.explain_model(trained_model, features)

    # Plot files that must be present once the explanation has run.
    expected_plots = (
        ROOT + '/source/CyberBullying/outputs/dependence_plot.png',
        ROOT + '/source/CyberBullying/outputs/summary_plot_bar.png',
        ROOT + '/source/CyberBullying/outputs/summary_plot.png',
    )
    for plot_path in expected_plots:
        assert os.path.isfile(plot_path)
예제 #6
0
def test_explainability():
    """Run the model-level and class-level explanations end to end.

    The test passes when every step completes without raising. Exceptions are
    left to propagate so that pytest reports the real exception type and
    traceback instead of a generic failure message.
    """
    model = utils.get_model(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    tagged_df = pre.preprocess(tagged_df)
    X = fe.extract_features(tagged_df, FEATURE_LIST)
    # The 'id' column is removed before the features are handed to the explainer.
    X = X.drop(columns=['id'])
    explanation.explain_model(model, X)
    explanation.explain_class(model, X)
예제 #7
0
def test_classicifation():
    """Train and run the RandomForest, then check the pickled model file exists.

    Exceptions (including a failed assertion) are left to propagate so that
    pytest reports the real cause and traceback instead of a generic message.

    NOTE(review): the function name is misspelled ("classicifation"); it is
    kept as-is because it is the identifier pytest collects and reports.
    """
    tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    # Use the value returned by preprocess, as the other tests in this file do.
    tagged_df = pre.preprocess(tagged_df)
    X = fe.extract_features(tagged_df, FEATURE_LIST)
    # Binary target: 1 where cb_level equals 3, otherwise 0.
    y = (tagged_df['cb_level'] == 3).astype(int)
    X = X.drop(columns=['id'])
    rf_obj = rf.RandomForest()
    rf_obj.train(X, y)
    rf_obj.predict(X)
    # NOTE(review): presumably train() writes this file — confirm; a file left
    # over from an earlier run would also satisfy this check.
    assert os.path.isfile(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
예제 #8
0
def test_performance():
    """Score the saved model on the dataset and check the reported metric names."""
    saved_model = utils.get_model(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    classifier = rf.RandomForest()
    classifier.model = saved_model

    dataset = pre.preprocess(
        utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    )
    features = fe.extract_features(dataset, FEATURE_LIST).drop(columns=['id'])
    # Binary target: 1 where cb_level equals 3, otherwise 0.
    target = (dataset['cb_level'] == 3).astype(int)

    probabilities = classifier.predict(features)
    # Threshold the predict() output at 0.5 to obtain 0/1 labels.
    predicted_labels = np.where(probabilities > 0.5, 1, 0)

    performance = per.get_performances(target, predicted_labels)
    metric_names = performance.keys()
    for metric in ('f-score', 'recall', 'precision'):
        assert metric in metric_names
예제 #9
0
"""
Experiment - includes a complete experiment that runs the following models: baseline, XGBoost, Random forest, 
and Naive Bayes.
The experiment saves their results, to choose the best model with the best results.
"""

"""
Creating the logger
"""
logger = Logger.get_logger_instance()

"""
get tagged df
"""
tagged_df = utils.read_to_df()  # Vigo data
# tagged_df = utils.create_csv_from_keepers_files()  # Keepers data

"""
Run a pre-processing function on the tagged data
"""
tagged_df = pre.preprocess(tagged_df)

"""
Run a extract features function on the clean and tagged data
"""
feature_list = ['post_length', 'tfidf', 'topics', 'screamer', 'words', 'off_dis', 'not_off_dis']
fe.folder_name = logger.folder_name
X = fe.extract_features(tagged_df, feature_list)
logger.write_features(feature_list)
y = (tagged_df['cb_level'] == 3).astype(int)
예제 #10
0
def test_read_data_with_bad_file():
    """Reading 'badFile.csv' must raise ValueError."""
    # The file is looked up relative to `parents[1]` of this test file.
    bad_file = pathlib.Path(__file__).parents[1] / 'badFile.csv'
    with pytest.raises(ValueError):
        utils.read_to_df(bad_file)
예제 #11
0
def test_read_data_from_unknown_path():
    """Reading from the path 'unknown/path' must raise FileNotFoundError."""
    missing_path = 'unknown/path'
    with pytest.raises(FileNotFoundError):
        utils.read_to_df(missing_path)
예제 #12
0
def test_preprocessing():
    """Check that preprocessing the dataset returns a pandas DataFrame.

    Exceptions (including a failed assertion) are left to propagate so that
    pytest reports the real cause and traceback instead of a generic message.
    """
    tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    processed = pre.preprocess(tagged_df)
    assert isinstance(processed, pd.DataFrame)