Exemplo n.º 1
0
def test_feature_extraction():
    """Check that feature extraction on the tagged dataset yields a DataFrame.

    The dataset is read from ``dataNew.csv``, pre-processed, and handed to
    ``fe.extract_features`` together with ``FEATURE_LIST``.

    No blanket ``try/except`` is used: pytest already fails a test on any
    uncaught exception, and letting it propagate keeps the real traceback.
    The former ``except Exception`` also caught the ``AssertionError`` and
    replaced it with a generic 'Unexpected error..' message.
    """
    tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    # Use the frame returned by preprocess instead of relying on in-place
    # mutation of the argument.
    tagged_df = pre.preprocess(tagged_df)
    features = fe.extract_features(tagged_df, FEATURE_LIST)
    assert isinstance(features, pd.DataFrame)
Exemplo n.º 2
0
def test_classicifation():
    """Train and run the random forest end to end, then check the model file.

    The target is binary: 1 where ``cb_level`` equals 3, otherwise 0. After
    training and predicting, ``outputs/RandomForest.pkl`` must exist on disk
    (which of the two calls writes it is not visible from this test).

    Exceptions are deliberately left to propagate: pytest reports them as a
    failure with the full traceback, whereas a blanket
    ``except Exception: pytest.fail(...)`` hides the root cause and also
    swallows the ``AssertionError`` below.
    """
    tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    # Keep the returned frame rather than relying on in-place mutation.
    tagged_df = pre.preprocess(tagged_df)
    X = fe.extract_features(tagged_df, FEATURE_LIST)
    y = (tagged_df['cb_level'] == 3).astype(int)
    X = X.drop(columns=['id'])
    rf_obj = rf.RandomForest()
    rf_obj.train(X, y)
    rf_obj.predict(X)
    assert os.path.isfile(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
def test_train_model():
    """Fit a random forest on the tagged dataset and check a model is set.

    The target is 1 where ``cb_level`` equals 3 and 0 otherwise; the ``id``
    column is removed from the extracted features before training.
    """
    forest = rf.RandomForest()
    data_path = ROOT + '/source/CyberBullying/dataNew.csv'
    frame = pre.preprocess(utils.read_to_df(data_path))
    wanted_features = [
        'post_length',
        'tfidf',
        'topics',
        'screamer',
        'words',
        'off_dis',
        'not_off_dis',
    ]
    features = fe.extract_features(frame, wanted_features).drop(columns=['id'])
    is_level_three = frame['cb_level'] == 3
    target = is_level_three.astype(int)
    forest.train(features, target)
    assert forest.model is not None
Exemplo n.º 4
0
def test_correct_preprocess():
    """Check the cleaned text produced by ``pre.preprocess`` for one post.

    The expected output keeps the Hebrew words and the exclamation marks,
    while the comma, period, emoticon, symbols and Latin-script words of the
    input are gone (note the trailing space in the expected string).
    """
    raw_text = 'מילה, עוד מילה. :) english @#$%^&* word סימן קריאה! שני סימני קריאה!! > אנגלית מחוברenglish'
    expected_text = 'מילה עוד מילה סימן קריאה! שני סימני קריאה!! אנגלית מחובר '
    raw_df = pd.DataFrame({'id': [1], 'text': [raw_text], 'cb_level': [0]})
    cleaned_texts = pre.preprocess(raw_df)['text'].tolist()
    assert cleaned_texts[0] == expected_text
def test_explain_class():
    """Explain a single post with the saved model and check the plot file.

    Loads ``RandomForest.pkl``, builds features for one Hebrew post, calls
    ``explanation.explain_class`` and expects ``force_plot_post.png`` to
    exist in the outputs folder afterwards.
    """
    model_path = ROOT + '/source/CyberBullying/outputs/RandomForest.pkl'
    plot_path = ROOT + '/source/CyberBullying/outputs/force_plot_post.png'
    wanted_features = [
        'post_length',
        'tfidf',
        'topics',
        'screamer',
        'words',
        'off_dis',
        'not_off_dis',
    ]

    model = utils.get_model(model_path)
    single_post = pd.DataFrame({'id': [1], 'text': ['אני אוהבת אותך']})
    cleaned = pre.preprocess(single_post)
    features = fe.extract_features(cleaned, wanted_features).drop(columns=['id'])
    explanation.explain_class(model, features)
    assert os.path.isfile(plot_path)
def test_explain_dataset():
    """Explain the saved model on the whole dataset and check the plot files.

    After ``explanation.explain_model`` runs, the dependence plot, the bar
    summary plot and the summary plot must all exist in the outputs folder.
    """
    model = utils.get_model(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    frame = pre.preprocess(utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv'))
    wanted_features = [
        'post_length',
        'tfidf',
        'topics',
        'screamer',
        'words',
        'off_dis',
        'not_off_dis',
    ]
    features = fe.extract_features(frame, wanted_features).drop(columns=['id'])
    explanation.explain_model(model, features)

    # Checked in the same order as before; the first missing file fails the test.
    expected_plots = (
        '/source/CyberBullying/outputs/dependence_plot.png',
        '/source/CyberBullying/outputs/summary_plot_bar.png',
        '/source/CyberBullying/outputs/summary_plot.png',
    )
    for relative_path in expected_plots:
        assert os.path.isfile(ROOT + relative_path)
Exemplo n.º 7
0
def test_explainability():
    """Smoke-test both explanation entry points on the full dataset.

    Loads the saved random forest, builds the feature matrix from the tagged
    dataset (without the ``id`` column) and runs ``explain_model`` and
    ``explain_class`` on it. The test passes when neither call raises.

    Exceptions are left to propagate so pytest reports the real traceback;
    the former ``except Exception: pytest.fail('Unexpected error..')``
    discarded the cause of the failure.
    """
    model = utils.get_model(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    tagged_df = pre.preprocess(tagged_df)
    X = fe.extract_features(tagged_df, FEATURE_LIST)
    X = X.drop(columns=['id'])
    explanation.explain_model(model, X)
    explanation.explain_class(model, X)
def test_correct_classification():
    """Predict a single post with the saved model and check the result type.

    The first element returned by ``predict`` must not be ``None`` and must
    be a ``numpy.float64``.
    """
    loaded_model = utils.get_model(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    classifier = rf.RandomForest()
    classifier.model = loaded_model

    single_post = pd.DataFrame({'id': [1], 'text': ['אני אוהבת אותך']})
    cleaned = pre.preprocess(single_post)
    wanted_features = [
        'post_length',
        'tfidf',
        'topics',
        'screamer',
        'words',
        'off_dis',
        'not_off_dis',
    ]
    features = fe.extract_features(cleaned, wanted_features).drop(columns=['id'])

    first_prob = classifier.predict(features)[0]
    assert first_prob is not None
    assert isinstance(first_prob, numpy.float64)
Exemplo n.º 9
0
def test_performance():
    """Check that the performance report exposes the expected metric keys.

    Predictions of the saved model are thresholded at 0.5 into 0/1 labels and
    compared with the binary target (1 where ``cb_level`` equals 3). The
    result of ``per.get_performances`` must contain 'f-score', 'recall' and
    'precision'.
    """
    loaded_model = utils.get_model(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    classifier = rf.RandomForest()
    classifier.model = loaded_model

    frame = pre.preprocess(utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv'))
    features = fe.extract_features(frame, FEATURE_LIST).drop(columns=['id'])
    target = (frame['cb_level'] == 3).astype(int)

    probabilities = classifier.predict(features)
    predicted_labels = np.where(probabilities > 0.5, 1, 0)
    report = per.get_performances(target, predicted_labels)

    # Same order as before; the first missing metric fails the test.
    for metric in ('f-score', 'recall', 'precision'):
        assert metric in report.keys()
Exemplo n.º 10
0
"""
Creating the logger
"""
logger = Logger.get_logger_instance()

"""
get tagged df
"""
tagged_df = utils.read_to_df()  # Vigo data
# tagged_df = utils.create_csv_from_keepers_files()  # Keepers data

"""
Run a pre-processing function on the tagged data
"""
tagged_df = pre.preprocess(tagged_df)

"""
Run a extract features function on the clean and tagged data
"""
feature_list = ['post_length', 'tfidf', 'topics', 'screamer', 'words', 'off_dis', 'not_off_dis']
fe.folder_name = logger.folder_name
X = fe.extract_features(tagged_df, feature_list)
logger.write_features(feature_list)
y = (tagged_df['cb_level'] == 3).astype(int)
X = X.drop(columns=['id'])

"""
Split data to train and test
"""
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
Exemplo n.º 11
0
def test_empty_dataset():
    """Check that pre-processing an empty frame with wrong columns raises.

    The frame is built outside the ``pytest.raises`` block so that only
    ``pre.preprocess`` can satisfy the expected ``ValueError``; a
    ``ValueError`` raised while constructing the fixture would otherwise make
    the test pass for the wrong reason.
    """
    df = pd.DataFrame([], columns=['not', 'the', 'right', 'columns'])
    with pytest.raises(ValueError):
        pre.preprocess(df)
Exemplo n.º 12
0
def test_incorrect_column_dataset():
    """Check that pre-processing a frame with unexpected columns raises.

    The one-row frame is built outside the ``pytest.raises`` block so that
    only ``pre.preprocess`` can satisfy the expected ``ValueError``; a
    ``ValueError`` raised while constructing the fixture would otherwise make
    the test pass for the wrong reason.
    """
    df = pd.DataFrame([[0, 0, 0, 0]],
                      columns=['not', 'the', 'right', 'columns'])
    with pytest.raises(ValueError):
        pre.preprocess(df)
Exemplo n.º 13
0
def test_preprocessing():
    """Check that pre-processing the tagged dataset returns a DataFrame.

    Exceptions are left to propagate so pytest shows the real traceback; the
    former ``except Exception: pytest.fail('Unexpected error..')`` also
    swallowed the ``AssertionError`` and hid why the test failed.
    """
    tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    cleaned = pre.preprocess(tagged_df)
    assert isinstance(cleaned, pd.DataFrame)