def label(self, start: int = 0, end: int = None):
    """
    Interactively label paragraphs of self.dataframe in the range [start, end).

    Person, party and media names are masked with placeholder tokens
    (<Person>, <Partei>, <Zeitung>) before title and text are printed,
    then the user is prompted for polarity and subjectivity labels.
    The labeled slice is finally written to "labeled_paragraphs".

    :param start: Index of the first row to label (inclusive).
    :param end: Index one past the last row to label; defaults to the
        length of the dataframe when None.
    """
    if end is None:
        end = len(self.dataframe)
    dataframe = self.dataframe.iloc[start:end]

    # Hoisted out of the loop: the media pattern does not depend on the row.
    # BUGFIX: "bildplus" must precede "bild" — Python's regex alternation is
    # leftmost-first, so with "bild" first, "bildplus" became "<Zeitung>plus".
    mediaPattern = re.compile("|".join(["bildplus", "bild", "taz", "tagesschau"]), re.IGNORECASE)

    for index, row in dataframe.iterrows():
        title = row["title"]
        text = row["original_text"]

        # Mask person names; very short names (<= 2 chars) are skipped to
        # avoid spurious matches. BUGFIX: escape each name so regex
        # metacharacters inside a name cannot corrupt the pattern.
        persons = list(filter(lambda person: len(person) > 2, row["persons"]))
        if len(persons) > 0:
            personPattern = re.compile("|".join(map(re.escape, persons)), re.IGNORECASE)
            title = personPattern.sub("<Person>", title)
            text = personPattern.sub("<Person>", text)

        # Mask party and media names.
        title = self.partyPattern.sub("<Partei>", title)
        text = self.partyPattern.sub("<Partei>", text)
        title = mediaPattern.sub("<Zeitung>", title)
        text = mediaPattern.sub("<Zeitung>", text)

        # Show the masked paragraph to the human labeler.
        print("================================================")
        print(title)
        print("++++++++++++++++++++++++++++++++++++++++++++++++")
        print(text)
        print("================================================")

        # Collect both labels for this row (written into `dataframe` at `index`).
        self.get_polarity_input(dataframe, index)
        self.get_subjectivity_input(dataframe, index)

    Writer.write_dataframe(dataframe, "labeled_paragraphs")
def train_threshold(self) -> float:
    """
    Train the sentiment-score threshold on labeled data.

    Sweeps the threshold from 0 to 0.005 in steps of 1e-7, remaps the
    sentiment at every step and keeps the threshold that maximizes the
    sum of the three SentiWS f1 scores. The sweep is visualized, the
    dataframe is re-mapped with the winner and persisted.

    :return The best threshold found.
    """
    candidate: float = 0
    winner: float = 0
    winner_f1_sum: float = 0
    sweep_thresholds: List[float] = []
    sweep_f1: List[Tuple[float, float, float, float]] = []

    # Fixed context window and fresh scores before sweeping.
    self.tfidf_sentiment.get_context_polarity(8)
    self.tfidf_sentiment.calculate_sentiment_score(overwrite=True)

    # Sweep: re-map the sentiment for each candidate threshold.
    while candidate <= 0.005:
        self.tfidf_sentiment.map_sentiment(threshold=candidate, overwrite=True)

        # Objective: sum of the three SentiWS f1 scores.
        f1_sentiws, _, _ = self.f1_score(training=True)
        f1_sum = f1_sentiws[0] + f1_sentiws[1] + f1_sentiws[2]

        sweep_thresholds.append(candidate)
        sweep_f1.append((f1_sum, f1_sentiws[0], f1_sentiws[1], f1_sentiws[2]))

        # Keep the best candidate seen so far.
        if f1_sum > winner_f1_sum:
            winner_f1_sum = f1_sum
            winner = candidate

        candidate += 0.0000001

    # Plot the sweep, then re-map with the winning threshold and persist.
    self.visualize_threshold(sweep_thresholds, sweep_f1, winner, 0.005)
    self.tfidf_sentiment.map_sentiment(threshold=winner, overwrite=True)
    Writer.write_dataframe(self.dataframe, "labeled_paragraphs")
    return winner
# NOTE(review): this fragment starts mid-expression — the leading ")" closes a
# call (presumably a print) begun before this chunk; indentation below is
# reconstructed and should be checked against the full file.
)
sys.exit()

# NOTE(review): `comparison` is built from `labeled_file` here, but the only
# visible assignment of `labeled_file` appears further down inside
# `if args.compare:` — the chunk ordering looks scrambled; confirm against
# the original file before relying on this flow.
comparison = Comparison(labeled_file)

# Train the score threshold
optimal_threshold = comparison.train_threshold()
print("Optimal threshold: {}\n".format(optimal_threshold))

# Train the window and the score threshold
optimal_context_thresholds = comparison.train_context_thresholds()
print("Optimal context thresholds: {} (window), {} (score)\n".format(
    optimal_context_thresholds[0], optimal_context_thresholds[1]))

# Save paragraphs to disk
Writer.write_dataframe(df_paragraphs, "paragraphs")

# Show GUI
if args.show_gui:
    gui = SentimentGUI(df_paragraphs)
    gui.show_gui()

# Compare labeled data with results
if args.compare:
    # The comparison requires a previously labeled file in the output folder.
    labeled_file = Path("src/output/labeled_paragraphs.json")
    if not labeled_file.exists():
        print(
            'You have to provide a labeled file "labeled_paragraphs.json" for comparison in the output folder'
        )
        sys.exit()
def train_context_thresholds(self) -> Tuple[float, float]:
    """
    Train the context threshold (SentiWs with context polarity) with labeled data.

    Runs a grid search: for every context window size in [0, 35] the
    score threshold is swept over [0, 0.001) in steps of 1e-5, and the
    pair maximizing the sum of the three context f1 scores wins. The
    per-window best results are visualized, the pipeline is re-run with
    the winning pair and the dataframe is persisted.

    :return The best thresholds for window size and score.
    """
    window: int = 0
    best_window: int = 0
    best_score_thr: float = 0
    top_f1_sum: float = 0
    window_history: List[float] = []
    f1_history: List[Tuple] = []

    # Outer sweep: every context window size from 0 through 35.
    while window <= 35:
        self.tfidf_sentiment.get_context_polarity(window)
        self.tfidf_sentiment.calculate_sentiment_score(overwrite=True)

        score_thr: float = 0
        # Best result for the current window only (used for the plot).
        window_best_sum: float = 0
        window_best_tuple: Tuple = ()

        # Inner sweep: score thresholds for the current window.
        while score_thr < 0.001:
            self.tfidf_sentiment.map_sentiment(overwrite=True, threshold=score_thr)
            self.dataframe = self.tfidf_sentiment.df_paragraphs

            # Objective: sum of the three context f1 scores.
            _, _, f1_context = self.f1_score(training=True)
            combined = f1_context[0] + f1_context[1] + f1_context[2]

            # Track the per-window best for visualization.
            if combined > window_best_sum:
                window_best_sum = combined
                window_best_tuple = (combined, f1_context[0], f1_context[1], f1_context[2])

            # Track the global best pair of thresholds.
            if combined > top_f1_sum:
                top_f1_sum = combined
                best_window = window
                best_score_thr = score_thr

            score_thr += 0.00001

        window_history.append(window)
        f1_history.append(window_best_tuple)
        window += 1

    # Plot the per-window bests across the sweep.
    self.visualize_threshold(window_history, f1_history, best_window, 35)

    # Re-run the pipeline once more with the winning thresholds and persist.
    self.tfidf_sentiment.get_context_polarity(best_window)
    self.tfidf_sentiment.calculate_sentiment_score(overwrite=True)
    self.tfidf_sentiment.map_sentiment(overwrite=True, threshold=best_score_thr)
    Writer.write_dataframe(self.dataframe, "labeled_paragraphs")
    return best_window, best_score_thr