import os
import pathlib

import numpy as np
import pandas as pd
import pytest

# Project-local modules. These import paths are assumptions based on the
# 'source/CyberBullying' layout referenced throughout the tests; adjust them
# to the actual package structure.
from source.CyberBullying import utils, explanation
from source.CyberBullying import feature_extraction as fe
from source.CyberBullying import preprocessing as pre
from source.CyberBullying import performances as per
from source.CyberBullying import random_forest as rf

ROOT = str(pathlib.Path(__file__).parents[3])
FEATURE_LIST = ['post_length', 'tfidf', 'topics', 'screamer',
                'words', 'off_dis', 'not_off_dis']


def test_extract_features_with_wrong_feature_list():
    HERE = pathlib.Path(__file__).parents[3]
    test_df = utils.read_to_df(HERE / 'source/CyberBullying/dataNew.csv')
    # Unknown feature names should be rejected up front.
    with pytest.raises(ValueError):
        feature_list = ['wrong', 'feature', 'list']
        fe.extract_features(test_df, feature_list)
def test_extract_features_with_bad_file():
    with pytest.raises(ValueError):
        test_df = pd.DataFrame(['not valid'], columns=['unknown'])
        fe.extract_features(test_df, FEATURE_LIST)
def test_feature_extraction():
    try:
        tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
        tagged_df = pre.preprocess(tagged_df)
        assert isinstance(fe.extract_features(tagged_df, FEATURE_LIST), pd.DataFrame)
    except Exception:
        pytest.fail('Unexpected error.')
def test_correct_features():
    # Two Hebrew posts: 'מילה עוד מילה חמודה' ("a word, another cute word";
    # four tokens, not offensive) and 'את מטומטמת' ("you are stupid";
    # two tokens, tagged cyberbullying level 3).
    raw_df = pd.DataFrame(
        [[1, 'מילה עוד מילה חמודה', 0],
         [2, 'את מטומטמת', 3]],
        columns=['id', 'text', 'cb_level'])
    test_df = fe.extract_features(raw_df, FEATURE_LIST)
    expected_columns = ['id', 'post_length', 'tfidf', 'T1', 'T2', 'T3',
                        'screamer', 'off_dis', 'not_off_dis']
    assert not test_df.isnull().values.any()
    for col in expected_columns:
        assert col in test_df.columns
    # Post length counts tokens: four in the first post, two in the second.
    assert test_df.iloc[0]['post_length'] == 4
    assert test_df.iloc[1]['post_length'] == 2
    # The remaining numeric features should be strictly positive for both rows.
    for col in ['tfidf', 'off_dis', 'not_off_dis', 'T1', 'T2', 'T3']:
        assert test_df.iloc[0][col] > 0
        assert test_df.iloc[1][col] > 0
def test_train_model():
    rf_obj = rf.RandomForest()
    tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    tagged_df = pre.preprocess(tagged_df)
    X = fe.extract_features(tagged_df, FEATURE_LIST)
    X = X.drop(columns=['id'])
    # A post is labelled positive when its cyberbullying level equals 3.
    y = (tagged_df['cb_level'] == 3).astype(int)
    rf_obj.train(X, y)
    assert rf_obj.model is not None
def test_explain_class():
    model = utils.get_model(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    post = 'אני אוהבת אותך'  # "I love you" -- a clearly non-offensive post
    tagged_df = pd.DataFrame({'id': [1], 'text': [post]})
    tagged_df = pre.preprocess(tagged_df)
    X = fe.extract_features(tagged_df, FEATURE_LIST)
    X = X.drop(columns=['id'])
    # Explaining a single post should write a force plot to outputs/.
    explanation.explain_class(model, X)
    assert os.path.isfile(ROOT + '/source/CyberBullying/outputs/force_plot_post.png')
def test_explain_dataset():
    model = utils.get_model(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    tagged_df = pre.preprocess(tagged_df)
    X = fe.extract_features(tagged_df, FEATURE_LIST)
    X = X.drop(columns=['id'])
    explanation.explain_model(model, X)
    # Explaining the whole dataset should write three plots to outputs/.
    for plot in ['dependence_plot.png', 'summary_plot_bar.png', 'summary_plot.png']:
        assert os.path.isfile(ROOT + '/source/CyberBullying/outputs/' + plot)
def test_explainability():
    try:
        model = utils.get_model(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
        tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
        tagged_df = pre.preprocess(tagged_df)
        X = fe.extract_features(tagged_df, FEATURE_LIST)
        X = X.drop(columns=['id'])
        explanation.explain_model(model, X)
        explanation.explain_class(model, X)
    except Exception:
        pytest.fail('Unexpected error.')
def test_classification():
    try:
        tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
        tagged_df = pre.preprocess(tagged_df)
        X = fe.extract_features(tagged_df, FEATURE_LIST)
        y = (tagged_df['cb_level'] == 3).astype(int)
        X = X.drop(columns=['id'])
        rf_obj = rf.RandomForest()
        rf_obj.train(X, y)
        rf_obj.predict(X)
    except Exception:
        pytest.fail('Unexpected error.')
    # Training is expected to persist the fitted model to outputs/. Kept
    # outside the try block so a failed assert is reported as an assertion
    # failure rather than swallowed as 'Unexpected error.'.
    assert os.path.isfile(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
def test_correct_classification():
    model = utils.get_model(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    rf_obj = rf.RandomForest()
    rf_obj.model = model
    post = 'אני אוהבת אותך'  # "I love you"
    tagged_df = pd.DataFrame({'id': [1], 'text': [post]})
    tagged_df = pre.preprocess(tagged_df)
    X = fe.extract_features(tagged_df, FEATURE_LIST)
    X = X.drop(columns=['id'])
    # predict() is expected to return one probability per row.
    y_prob_rf = rf_obj.predict(X)
    my_prob = y_prob_rf[0]
    assert my_prob is not None
    assert isinstance(my_prob, np.float64)
def test_performance():
    model = utils.get_model(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    rf_obj = rf.RandomForest()
    rf_obj.model = model
    df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    df = pre.preprocess(df)
    X = fe.extract_features(df, FEATURE_LIST)
    X = X.drop(columns=['id'])
    y = (df['cb_level'] == 3).astype(int)
    y_prob_rf = rf_obj.predict(X)
    # Threshold the predicted probabilities at 0.5 to obtain hard labels.
    pred = np.where(y_prob_rf > 0.5, 1, 0)
    performance = per.get_performances(y, pred)
    for metric in ('f-score', 'recall', 'precision'):
        assert metric in performance
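# A hypothetical stand-in illustrating what per.get_performances is assumed to
# return: the three keys asserted in test_performance map onto standard
# sklearn metrics. The project's real helper may be implemented differently.
from sklearn.metrics import f1_score, precision_score, recall_score


def get_performances_sketch(y_true, y_pred):
    """Mirror the dict shape checked in test_performance (illustrative only)."""
    return {
        'f-score': f1_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
    }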
get tagged df
"""
tagged_df = utils.read_to_df()  # Vigo data
# tagged_df = utils.create_csv_from_keepers_files()  # Keepers data

"""
Run a pre-processing function on the tagged data
"""
tagged_df = pre.preprocess(tagged_df)

"""
Run the feature-extraction function on the clean, tagged data
"""
feature_list = ['post_length', 'tfidf', 'topics', 'screamer',
                'words', 'off_dis', 'not_off_dis']
fe.folder_name = logger.folder_name
X = fe.extract_features(tagged_df, feature_list)
logger.write_features(feature_list)
y = (tagged_df['cb_level'] == 3).astype(int)
X = X.drop(columns=['id'])

"""
Split the data into train and test sets
"""
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
performances_list = {}
auc_list = {}

"""
Running the baseline model
"""
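# The fragment cuts off right after the header above. A minimal sketch of the
# baseline run it announces, assuming the rf.RandomForest wrapper and
# per.get_performances helper exercised by the tests; the authors' actual
# baseline code may differ.
from sklearn.metrics import roc_auc_score

rf_obj = rf.RandomForest()
rf_obj.train(X_train, y_train)
y_prob_rf = rf_obj.predict(X_test)       # predicted probabilities on the test set
pred = np.where(y_prob_rf > 0.5, 1, 0)   # hard labels at a 0.5 threshold
performances_list['RandomForest'] = per.get_performances(y_test, pred)
auc_list['RandomForest'] = roc_auc_score(y_test, y_prob_rf)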