def run_feature_extraction_create_corpus(run_from_scratch, df_preprocessed):
    """Build the corpus from the preprocessed data, or load the cached copy.

    Parameters:
    - run_from_scratch: when True, build the corpus and cache it as CSV;
      when False, load the previously cached corpus CSV.
    - df_preprocessed: dataframe with the preprocessed dataset.

    Return:
    dataframe containing the corpus.
    """
    corpus_path = str(get_project_root()) + "/data/extracted_features/corpus.csv"
    if run_from_scratch:
        df_corpus = build_corpus(df_preprocessed)
        df_corpus.to_csv(corpus_path)
        return df_corpus
    # index_col=0 so the reloaded frame matches the one written by to_csv()
    # above (to_csv writes the index); without it the cached path returned an
    # extra "Unnamed: 0" column that the scratch path did not have.
    return pd.read_csv(corpus_path, index_col=0)
def run_preprocessing(run_from_scratch):
    """Run data preprocessing if run_from_scratch=True, then load the result.

    Parameters:
    - run_from_scratch: when True, rebuild the preprocessed dataset CSV on
      disk before loading; when False, just load the cached CSV.

    Return:
    dataframe with the preprocessed dataset.
    """
    if run_from_scratch:
        # prepare corpus
        print("\nPreparing data ...")
        prepare_and_merge_datasets()
    # Both branches load the same file; the flag only controls regeneration
    # (the original duplicated this read/return in both branches).
    return pd.read_csv(
        str(get_project_root()) + "/data/preprocessed/dataset.csv",
        index_col=0,
    )
def run_feature_extraction(run_from_scratch, df_corpus):
    """Run feature extraction, or load previously extracted features.

    Parameters:
    - run_from_scratch: when True, extract features from the corpus and cache
      them as CSV; when False, load the cached features CSV.
    - df_corpus: dataframe with the corpus to extract features from.

    Return:
    dataframe with the extracted (numeric) features plus the class column.
    """
    features_path = (str(get_project_root())
                     + "/data/extracted_features/extracted_features.csv")
    if run_from_scratch:
        print("\nExtracting features ...")
        df_extracted_features = FeatureExtractor(
            df_corpus).get_df_with_all_features()
        # Drop the raw-text columns; only the computed features are kept.
        df_extracted_features = df_extracted_features.drop(
            ["original_content", "content", "tokens", "pos", "stems"], axis=1)
        df_extracted_features.to_csv(features_path)
        return df_extracted_features
    # index_col=0 keeps the cached frame identical to the freshly computed
    # one (to_csv above writes the index; the original reload kept it as an
    # extra "Unnamed: 0" column).
    return pd.read_csv(features_path, index_col=0)
def train_fasttext(self, path_to_csv_dataset_file):
    """Train fasttext model based on dataset

    Parameters:
    - path_to_csv_dataset_file: relative path to the dataset file
      (expects a csv file)
    """
    # Train an unsupervised skipgram model, then persist it for later reuse.
    trained_model = fasttext.train_unsupervised(path_to_csv_dataset_file,
                                                model="skipgram")
    destination = str(get_project_root()) + "/models/fasttext_model.bin"
    trained_model.save_model(destination)
def _prepare_hate_speech_dataset():
    # Load the hate-speech dataset's annotation metadata, dropping columns
    # that are not needed downstream, then map labels and attach the text.
    metadata_path = (
        str(get_project_root())
        + "/data/original/hate-speech-dataset/annotations_metadata.csv"
    )
    unused_columns = ["user_id", "subforum_id", "num_contexts"]
    df_dataset = _create_df_and_drop_columns(metadata_path, None, unused_columns)
    return _filter_and_format_hate_speech(df_dataset)
def prepare_and_merge_datasets(include_offensive_language=False):
    """Prepares and merges the datasets.

    Parameters:
    - include_offensive_language: when True, keep offensive-language rows from
      the first dataset and skip merging in the second dataset.

    Writes the merged dataset CSV to both data/preprocessed/ and analysis/.
    """
    if include_offensive_language:
        df_dataset = _prepare_hate_speech_and_offensive_language(True)
    else:
        df_first_dataset = _prepare_hate_speech_and_offensive_language()
        df_second_dataset = _prepare_hate_speech_dataset()
        df_dataset = pd.concat([df_first_dataset, df_second_dataset],
                               ignore_index=True)
    # Serialize once and write to both locations. `with` guarantees the
    # handles are closed even if a write fails (the original opened both
    # files without context managers and leaked them on error), and the
    # files are no longer truncated before the dataframe is built.
    csv_text = df_dataset.to_csv()
    output_paths = (
        str(get_project_root()) + "/data/preprocessed/dataset.csv",
        str(get_project_root()) + "/analysis/dataset.csv",
    )
    for path in output_paths:
        with open(path, encoding="utf-8", mode="w") as out_file:
            out_file.write(csv_text)
def _prepare_hate_speech_and_offensive_language(include_offensive_language=False):
    # Load labeled_data.csv, dropping the per-annotator count columns that
    # are not used as features.
    dataset_path = (
        str(get_project_root())
        + "/data/original/hate-speech-and-offensive-language/labeled_data.csv"
    )
    df_dataset = _create_df_and_drop_columns(
        dataset_path,
        0,
        ["count", "hate_speech", "offensive_language", "neither"],
    )
    if include_offensive_language:
        # Keep all rows; only align the text column name with the other dataset.
        df_dataset.rename(columns={"tweet": "content"}, inplace=True)
        return df_dataset
    return _filter_and_format_hate_speech_and_offensive_language(df_dataset)
def visualize_special_characters(self, df):
    """Visualizes the number of special characters as bar plot

    Parameters:
    df: dataframe with the extracted features for special characters

    Return:
    stores barplots in analysis folder
    """
    df_hate_speech = df[df["class"] == 0]
    df_neutral_speech = df[df["class"] == 1]
    for character in self.list_of_special_characters:
        hate_bincount = self._calculate_bincount_of_special_character(
            df_hate_speech, character
        )
        neutral_bincount = self._calculate_bincount_of_special_character(
            df_neutral_speech, character
        )
        hate_bincount_summarized = self._summarize_bincount_data(hate_bincount)
        neutral_bincount_summarized = self._summarize_bincount_data(
            neutral_bincount
        )
        # BUG FIX: start a fresh figure per character and close it after
        # saving — the original reused one implicit figure, so every later
        # saved plot also contained the bars of all earlier characters.
        plt.figure()
        x = np.arange(11)
        plt.bar(x + 0.0, hate_bincount_summarized, color="r", width=0.2)
        plt.bar(x + 0.2, neutral_bincount_summarized, color="b", width=0.2)
        # Buckets 0..9 plus a ">10" overflow bucket.
        x_ticks = [str(tick) for tick in range(10)]
        x_ticks.append(">10")
        plt.xticks(x, x_ticks)
        plt.title(
            "Number of data instances with number of "
            + self.list_of_special_characters[character]
        )
        plt.xlabel("Number of " + character)
        plt.ylabel("Number of data instances")
        plt.legend(["hate speech", "neutral speech"])
        plt.savefig(
            str(get_project_root())
            + "/analysis/features/semantic/barchart_special_character_"
            + character
        )
        plt.close()
def visualize_word_embeddings_with_tsne(self, model):
    """Creates an TSNE model and plots it

    Parameters:
    - model: fasttext model
    """
    labels = []
    tokens = []
    for word in model.words:
        tokens.append(model[word])
        labels.append(word)
    tsne_model = TSNE(perplexity=40, n_components=2, init="pca",
                      n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)
    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
    plt.figure(figsize=(16, 16))
    # BUG FIX: the original looped `for index, x in enumerate(x)`, which
    # rebound `x` to a scalar coordinate so `x[index]` raised a TypeError
    # on the very first iteration.
    for index in range(len(x)):
        plt.scatter(x[index], y[index])
        plt.annotate(
            labels[index],
            xy=(x[index], y[index]),
            xytext=(5, 2),
            textcoords="offset points",
            ha="right",
            va="bottom",
        )
    plt.savefig(
        str(get_project_root()) + "/analysis/fasttext_tsne_visualization")
    plt.show()
def _filter_and_format_hate_speech(df): df.loc[df["label"] == "hate", "label"] = 0 df.loc[df["label"] == "noHate", "label"] = 1 df.rename(columns={"label": "class"}, inplace=True) content = [] for i, row in df.iterrows(): if row["class"] == "idk/skip" or row["class"] == "relation": df = df.drop(index=i) continue file = open( str(get_project_root()) + "/data/original/hate-speech-dataset/all_files/{}.txt".format( row["file_id"] ), encoding="utf-8", mode="r", ) content.append(file.read()) file.close() df["content"] = content df.drop(["file_id"], axis=1, inplace=True) return df
def extract_features(self, df, visualize=False):
    """Extract vector representation of the data instance based on word
    embeddings trained by fasttext

    Parameters:
    df with the column containing the tokens of each data instance

    Return:
    passed df with new feature column containing a vector (mean of the word
    embeddings of all tokens)
    """
    model = fasttext.load_model(
        str(get_project_root()) + "/models/fasttext_model.bin")
    # One embedding vector per data instance, derived from its tokens.
    instance_vectors = df["tokens"].apply(
        lambda cell: self.get_vector_of_data_instance(model, cell))
    # Expand every 100-dim vector into its own named feature column.
    df_vector_columns = pd.DataFrame(instance_vectors.values.tolist())
    df_vector_columns.columns = [
        "fasttext_word_embeddings_vector_" + str(i) for i in range(100)
    ]
    df = pd.concat([df_vector_columns, df], axis=1)
    if visualize:
        self.visualize_word_embeddings_with_tsne(model)
    return df
def __init__(self):
    # Train the fasttext model on the preprocessed dataset as soon as the
    # object is constructed.
    dataset_path = str(get_project_root()) + "/data/preprocessed/dataset.csv"
    self.train_fasttext(dataset_path)
# Pipeline driver: each stage flag (preprocessing / corpus /
# feature_extraction) controls whether the stage is recomputed from scratch
# or loaded from its cached CSV.
df_preprocessed_data = run_preprocessing(preprocessing)
df_data_corpus = run_feature_extraction_create_corpus(
    corpus, df_preprocessed_data)
df_data_extracted_features = run_feature_extraction(
    feature_extraction, df_data_corpus)

# unchanged dataset
raw_text_features = df_preprocessed_data["content"]
raw_text_labels = df_preprocessed_data["class"]

# Split extracted features into inputs (every column except "class") and labels.
extracted_features = df_data_extracted_features.loc[:, df_data_extracted_features
                                                    .columns != "class"]
labels = df_data_extracted_features["class"]

# do balancing, i.e. over- and undersampling
# NOTE(review): balancing presumably happens inside InputData — confirm.
input_data = InputData(raw_text_features, raw_text_labels, extracted_features,
                       labels)

# feature importances
print("\nFeature importances ...")
feature_importance = FeatureImportance(extracted_features, labels,
                                       extracted_features.columns.values)
feature_importance.get_importance_scores()

# run classifiers
print("\nRunning classifiers ...")
classifier_executor = ClassifierExecutor(input_data.get_datasets())
df_results = classifier_executor.get_results()
df_results.to_csv(str(get_project_root()) + "/results/results.csv")
def test_get_project_root(self):
    """The project root path should end in .../HateSpeechDetection/src."""
    project_root_path = get_project_root()
    path_components = str(project_root_path).split("/")
    # Negative indexing replaces the original `len(path_components) - 1`
    # arithmetic — same elements, idiomatic access.
    self.assertEqual("src", path_components[-1])
    self.assertEqual("HateSpeechDetection", path_components[-2])