def test_read_data_correctly():
    """Check that dataNew.csv loads with 'text' and 'cb_level' columns and at least one row."""
    base_dir = pathlib.Path(__file__).parents[3]
    csv_path = base_dir / 'source/CyberBullying/dataNew.csv'
    loaded = utils.read_to_df(csv_path)

    # Both expected columns must be present in the loaded frame.
    for required_column in ('text', 'cb_level'):
        assert required_column in loaded.columns

    row_count = loaded.shape[0]
    assert row_count > 0
def test_feature_extraction():
    """Verify that extract_features returns a DataFrame for the preprocessed data.

    Exceptions are deliberately left to propagate: pytest already reports an
    unexpected exception as a failure, and it does so with the real traceback
    rather than a generic message.
    """
    tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    # Work on the frame returned by preprocess instead of relying on the
    # input having been mutated in place.
    tagged_df = pre.preprocess(tagged_df)
    features = fe.extract_features(tagged_df, FEATURE_LIST)
    assert isinstance(features, pd.DataFrame)
def test_extract_features_with_wrong_feature_list():
    """Expect extract_features to raise ValueError for the list ['wrong', 'feature', 'list']."""
    base_dir = pathlib.Path(__file__).parents[3]
    raw_df = utils.read_to_df(base_dir / 'source/CyberBullying/dataNew.csv')
    bogus_features = ['wrong', 'feature', 'list']

    # Only the call under test sits inside the raises block.
    with pytest.raises(ValueError):
        fe.extract_features(raw_df, bogus_features)
def test_train_model():
    """Train a RandomForest on the extracted features and check that its model is set."""
    selected_features = [
        'post_length',
        'tfidf',
        'topics',
        'screamer',
        'words',
        'off_dis',
        'not_off_dis',
    ]

    forest = rf.RandomForest()
    data_path = ROOT + '/source/CyberBullying/dataNew.csv'
    prepared = pre.preprocess(utils.read_to_df(data_path))

    # The 'id' column is dropped from the extracted features before training.
    features = fe.extract_features(prepared, selected_features).drop(columns=['id'])
    # Binary target: 1 where cb_level equals 3, otherwise 0.
    target = (prepared['cb_level'] == 3).astype(int)

    forest.train(features, target)
    assert forest.model is not None
def test_explain_dataset():
    """Run explain_model with the saved model and check that three plot files exist."""
    selected_features = [
        'post_length',
        'tfidf',
        'topics',
        'screamer',
        'words',
        'off_dis',
        'not_off_dis',
    ]

    saved_model = utils.get_model(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    prepared = pre.preprocess(utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv'))
    features = fe.extract_features(prepared, selected_features).drop(columns=['id'])

    explanation.explain_model(saved_model, features)

    # Each of these files must be present once explain_model has run.
    expected_plots = (
        ROOT + '/source/CyberBullying/outputs/dependence_plot.png',
        ROOT + '/source/CyberBullying/outputs/summary_plot_bar.png',
        ROOT + '/source/CyberBullying/outputs/summary_plot.png',
    )
    for plot_path in expected_plots:
        assert os.path.isfile(plot_path)
def test_explainability():
    """Check that explain_model and explain_class both run on the saved model.

    The test passes when neither call raises. Exceptions are left to
    propagate so pytest reports the actual error and traceback instead of a
    generic 'Unexpected error..' message.
    """
    model = utils.get_model(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    tagged_df = pre.preprocess(tagged_df)

    X = fe.extract_features(tagged_df, FEATURE_LIST)
    # The 'id' column is removed before the features are passed on.
    X = X.drop(columns=['id'])

    explanation.explain_model(model, X)
    explanation.explain_class(model, X)
def test_classicifation():
    """Train and predict with a RandomForest, then check the pickled model file exists.

    Exceptions (including a failed assertion) are left to propagate so pytest
    reports the real cause; the previous blanket handler replaced every
    failure with the same uninformative message.
    """
    tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    # Use the frame returned by preprocess so that the features and the
    # target below are both derived from the same preprocessed data.
    tagged_df = pre.preprocess(tagged_df)

    X = fe.extract_features(tagged_df, FEATURE_LIST)
    # Binary target: 1 where cb_level equals 3, otherwise 0.
    y = (tagged_df['cb_level'] == 3).astype(int)
    X = X.drop(columns=['id'])

    rf_obj = rf.RandomForest()
    rf_obj.train(X, y)
    rf_obj.predict(X)

    assert os.path.isfile(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
def test_performance():
    """Score the saved model's predictions and check the reported metric names."""
    saved_model = utils.get_model(ROOT + '/source/CyberBullying/outputs/RandomForest.pkl')
    forest = rf.RandomForest()
    forest.model = saved_model

    frame = pre.preprocess(utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv'))
    features = fe.extract_features(frame, FEATURE_LIST).drop(columns=['id'])
    # Binary ground truth: 1 where cb_level equals 3, otherwise 0.
    labels = (frame['cb_level'] == 3).astype(int)

    scores = forest.predict(features)
    # Threshold the predict output at 0.5 to obtain 0/1 predictions.
    predicted = np.where(scores > 0.5, 1, 0)

    performance = per.get_performances(labels, predicted)
    reported_metrics = performance.keys()
    for metric_name in ('f-score', 'recall', 'precision'):
        assert metric_name in reported_metrics
""" Experiment - includes a complete experiment that runs the following models: baseline, XGBoost, Random forest, and Naive Bayes. The experiment saves their results, to choose the best model with the best results. """

# NOTE: the triple-quoted strings below are bare string statements used as
# section labels; they have no runtime effect.

""" Creating the logger """
logger = Logger.get_logger_instance()

""" get tagged df """
# read_to_df is called without arguments here, so it uses whatever source it
# defaults to (labelled "Vigo data" by the original author).
tagged_df = utils.read_to_df()  # Vigo data
# Alternative data source, currently disabled:
# tagged_df = utils.create_csv_from_keepers_files()  # Keepers data

""" Run a pre-processing function on the tagged data """
tagged_df = pre.preprocess(tagged_df)

""" Run a extract features function on the clean and tagged data """
feature_list = ['post_length', 'tfidf', 'topics', 'screamer', 'words', 'off_dis', 'not_off_dis']
# Hand the logger's folder name to the feature-extraction module before
# extracting (presumably so its outputs land in the same run folder - confirm).
fe.folder_name = logger.folder_name
X = fe.extract_features(tagged_df, feature_list)
# Pass the list of feature names used in this run to the logger.
logger.write_features(feature_list)
# Binary target: 1 where cb_level equals 3, otherwise 0.
y = (tagged_df['cb_level'] == 3).astype(int)
def test_read_data_with_bad_file():
    """Expect read_to_df to raise ValueError when given badFile.csv."""
    base_dir = pathlib.Path(__file__).parents[1]
    bad_csv = base_dir / 'badFile.csv'

    with pytest.raises(ValueError):
        utils.read_to_df(bad_csv)
def test_read_data_from_unknown_path():
    """Expect read_to_df to raise FileNotFoundError for the path 'unknown/path'."""
    missing_path = 'unknown/path'

    with pytest.raises(FileNotFoundError):
        utils.read_to_df(missing_path)
def test_preprocessing():
    """Verify that preprocess returns a DataFrame for the tagged data.

    No blanket exception handler is used: an unexpected exception or a failed
    isinstance check is reported by pytest with its own traceback, rather
    than being replaced by a generic failure message.
    """
    tagged_df = utils.read_to_df(ROOT + '/source/CyberBullying/dataNew.csv')
    assert isinstance(pre.preprocess(tagged_df), pd.DataFrame)