def __create_articles() -> Articles:
    article1 = Article("1", "Il s'agit d'un titre", "Le résumé numéro un.",
                       ["theme1"], ["theme1", "old_prediction"], [])
    article2 = Article("2", "Ce sont deux titres", "Le résumé numéro deux.",
                       ["theme1", "theme2", "theme3"], ["other_theme"], [])

    return Articles([article1, article2])
def test_articles_not_modified_by_predictor(self):
    """
    Test that the article fields 'themes' and 'verified_themes' are not modified by the predictor.
    """
    tokenizer_init_article = Article(
        id="0",
        title="",
        summary="theme1 theme2 theme3",
        themes=["theme1", "theme2", "theme3"],
        verified_themes=["theme1", "theme2", "theme3"],
        predicted_themes=[])
    article_one = Article(id="1",
                          title="",
                          summary="theme1 theme2",
                          themes=["one", "two"],
                          verified_themes=["one", "two", "three"],
                          predicted_themes=["three"])

    article_tokenizer = ArticleTextTokenizer(
        Articles([tokenizer_init_article]), 3)
    theme_tokenizer = ArticleThemeTokenizer(
        Articles([tokenizer_init_article]))

    predictor = ArticlePredictor(
        classifier_model=MockModel.get_model(),
        supported_themes=["theme1", "theme2", "theme3"],
        preprocessor=MockPreprocessor(),
        article_tokenizer=article_tokenizer,
        theme_tokenizer=theme_tokenizer)

    prediction = predictor.predict_preprocessed(Articles(article=article_one))
    article_with_predictions = prediction.get_articles_with_predictions()[0]

    self.assertEqual(["one", "two"], article_with_predictions.themes)
    self.assertEqual(["one", "two", "three"],
                     article_with_predictions.verified_themes)
    self.assertEqual(["theme1", "theme2"],
                     article_with_predictions.predicted_themes)
def __init__(self, X: List[List[Optional[Any]]],
             Y: List[List[Optional[Any]]], articles: Articles,
             validation_ratio: float, batch_size: int):
    """
    Creates and wraps the tensorflow datasets used for training and validation.
    :param X: input matrix (one row per article)
    :param Y: output matrix (one row per article)
    :param articles: articles corresponding to the rows of X and Y
    :param validation_ratio: fraction of the rows kept for validation
    :param batch_size: batch size used for both datasets
    """
    if len(X) == 0:
        raise Exception("X matrix has no rows!")

    self.row_count: int = len(X)
    self.article_length: int = len(X[0])
    self.theme_count: int = len(Y[0])

    self.train_ratio: float = 1 - validation_ratio
    self.train_size = math.ceil(self.train_ratio * self.row_count)
    self.validation_size = math.ceil(validation_ratio * self.row_count)
    self.train_batch_count = int(math.ceil(self.train_size / batch_size))
    self.validation_batch_count = int(
        math.ceil(self.validation_size / batch_size))

    self.X_train = X[:self.train_size]
    self.Y_train = Y[:self.train_size]
    self.X_val = X[self.train_size:]
    self.Y_val = Y[self.train_size:]

    self.articles_train: Articles = Articles(articles[:self.train_size])
    self.articles_validation: Articles = Articles(
        articles[self.train_size:])

    # tf.data.Dataset creation.
    # Only the training data is shuffled; shuffling is not needed for evaluation.
    self.trainData = tf.data.Dataset.from_tensor_slices((self.X_train, self.Y_train))\
        .shuffle(len(self.X_train))\
        .batch(batch_size)\
        .repeat()
    self.validationData = tf.data.Dataset.from_tensor_slices((self.X_val, self.Y_val))\
        .batch(batch_size)\
        .repeat()
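A minimal, self-contained sketch of the split arithmetic performed above; the values are illustrative only and not taken from the project.

# Illustrative only: how the train/validation sizes and batch counts are derived.
import math

row_count = 10          # e.g. 10 articles (hypothetical value)
validation_ratio = 0.2  # 20% of the rows kept for validation
batch_size = 4

train_ratio = 1 - validation_ratio
train_size = math.ceil(train_ratio * row_count)                        # 8
validation_size = math.ceil(validation_ratio * row_count)              # 2
train_batch_count = int(math.ceil(train_size / batch_size))            # 2
validation_batch_count = int(math.ceil(validation_size / batch_size))  # 1

print(train_size, validation_size, train_batch_count, validation_batch_count)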
def __init__(self, articles: Articles):
    self.tokenizer = Tokenizer()
    self.tokenizer.fit_on_texts(articles.themes())

    self.one_hot_matrix = self.tokenizer.texts_to_matrix(articles.themes())
    # Remove the first column, which contains only 0s (index 0 is reserved by the tokenizer).
    self.one_hot_matrix = np.delete(arr=self.one_hot_matrix, obj=0, axis=1)

    # Create the ordered list of themes, in the same order as in the tokenizer.
    self.orderedThemes: List[str] = []
    for i in range(1, len(self.tokenizer.word_index) + 1):  # word_index starts at 1, 0 is reserved.
        self.orderedThemes.append(self.tokenizer.index_word[i])

    self.themes_count = len(self.tokenizer.word_index)
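A standalone sketch (Keras only, made-up fixture) of why the first column is dropped: texts_to_matrix reserves index 0, so its first column is always zero.

# Illustrative sketch: index 0 is reserved, so column 0 of texts_to_matrix is always 0.
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

themes = [["theme1", "theme2"], ["theme1"]]  # hypothetical fixture

tokenizer = Tokenizer()
tokenizer.fit_on_texts(themes)

matrix = tokenizer.texts_to_matrix(themes)
print(matrix.shape)                            # (2, 3): column 0 is all zeros
print(np.delete(matrix, obj=0, axis=1).shape)  # (2, 2): one column per theme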
def __init__(self, articles: Articles, max_article_length: int):
    self.tokenizer = Tokenizer()
    self.tokenizer.fit_on_texts(articles.title_and_summary())

    self.max_article_length: int = max_article_length
    self.sequences = self.transform_to_sequences(articles)
    self.voc_size = len(self.tokenizer.word_index) + 1  # +1 because we pad with 0.
    self.document_count = self.tokenizer.document_count
def transform_to_sequences(
        self, preprocessed_articles: Articles) -> List[List[Optional[Any]]]:
    """Transform article contents into padded vectors of length 'max_article_length'."""
    matrix = self.tokenizer.texts_to_sequences(
        preprocessed_articles.title_and_summary())
    matrix = keras.preprocessing.sequence.pad_sequences(
        matrix, value=0, padding='post', maxlen=self.max_article_length)

    return matrix
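A standalone sketch of the padding behaviour used by transform_to_sequences, with made-up texts: sequences shorter than max_article_length are padded with 0 at the end, longer ones are truncated (from the start, by Keras' default).

from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

texts = ["theme1 theme2", "theme1 theme2 theme3 theme1"]  # hypothetical articles
max_article_length = 3

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)
padded = keras.preprocessing.sequence.pad_sequences(
    sequences, value=0, padding='post', maxlen=max_article_length)
print(padded)  # every row has exactly max_article_length entries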
def process_articles(self, articles: Articles) -> Articles:
    """
    Remove stopwords and perform lemmatization on each article, for the specified language.
    :param articles: articles to process
    :return: processed articles
    """
    with Pool(8) as p:
        return Articles(p.map(self.process_article, articles.items))
class ArticlesPrediction:
    raw_predictions: Dict[str, List[float]]
    __theme_tokenizer: ArticleThemeTokenizer
    articles: Articles

    def __init__(self, theme_tokenizer: ArticleThemeTokenizer, articles: Articles):
        self.__theme_tokenizer = theme_tokenizer
        self.articles = articles
        # Initialized per instance: a class-level default dict would be shared
        # between all ArticlesPrediction instances.
        self.raw_predictions = {}

    def addPredictionsForArticle(self, predictions: List[float], article_id: str):
        """
        Add the predictions for the given article.
        :param predictions: prediction probabilities, one per theme
        :param article_id: id of the article the predictions belong to
        """
        self.raw_predictions[article_id] = predictions

    def get_articles_with_predictions(self, threshold: float = 0.5) -> Articles:
        return self.__apply_on_articles(threshold)

    def __apply_on_article(self, article: Article, threshold: float):
        """
        Apply the predictions on a single article.
        :param article: article to update
        :param threshold: minimum probability to consider a theme as positively predicted
        """
        if article.id not in self.raw_predictions:
            raise Exception(f"No prediction found for that article ({article.id})")

        article.predicted_themes = self.__transform_to_themes(
            self.raw_predictions[article.id], threshold)

    def __apply_on_articles(self, threshold: float) -> Articles:
        """
        Apply the predictions on all articles.
        :param threshold: minimum probability to consider a theme as positively predicted
        """
        articles = self.articles.deep_copy()

        for article in articles:
            self.__apply_on_article(article, threshold)

        return articles

    def __transform_to_themes(self, predictions: List[float], threshold: float) -> List[str]:
        """
        Transform predictions given as probabilities into a list of theme names.
        :param predictions: predictions for a single article
        :param threshold: minimum probability to consider a theme as positively predicted
        :return: list of predicted theme names
        """
        boolean_vector = [probability >= threshold for probability in predictions]

        return self.__theme_tokenizer.boolean_vector_to_themes(boolean_vector)
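A hedged usage sketch of ArticlesPrediction built from the classes above; the single-theme fixture and values are assumptions for illustration, not taken from the project.

# Hypothetical fixture: one article, one theme, so each prediction has one probability.
article = Article("1", "title", "summary",
                  themes=["theme1"], verified_themes=[], predicted_themes=[])
articles = Articles([article])
theme_tokenizer = ArticleThemeTokenizer(articles)

prediction = ArticlesPrediction(theme_tokenizer, articles)
prediction.addPredictionsForArticle([0.8], article.id)  # one probability per theme

# Themes with probability >= threshold end up in 'predicted_themes' on a deep copy.
predicted = prediction.get_articles_with_predictions(threshold=0.5)
print(predicted[0].predicted_themes)  # expected: ["theme1"]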
def process_article(self, article: Article) -> Article:
    """
    Remove stopwords and do lemmatization on one single article, for the specified language.
    :param article: article to process
    :return: processed article
    """
    # Retry until the external preprocessor succeeds for this article.
    while True:
        try:
            return self.__execute_swift_program(Articles(article=article)).items[0]
        except Exception:
            continue
def test_save(self):
    article1 = Article("1", "title", "summary",
                       ["theme1", "theme2", "theme3"], [], [])
    article2 = Article("2", "title", "summary", ["theme1", "theme4"], [], [])
    articles = Articles([article1, article2])

    tokenizer = ArticleThemeTokenizer(articles)
    tokenizer.save("test.json")

    with open("test.json", "r") as file:
        content = file.readlines()

    self.assertTrue(len(content) > 0)
def __execute_swift_program(self, articles: Articles) -> Articles:
    (input_file, input_path) = tempfile.mkstemp()
    (output_file, output_path) = tempfile.mkstemp()
    os.close(input_file)
    os.close(output_file)

    articles.save(input_path)
    self.logger.info(f"Articles about to be processed available at {input_path}.")

    command_directory = os.path.dirname(os.path.abspath(__file__))
    command_path = f"{command_directory}/ArticlePreprocessorTool"

    with subprocess.Popen([command_path, input_path, output_path],
                          stdout=subprocess.PIPE) as process:
        while True:
            output = process.stdout.readline()
            if process.poll() is not None:
                break
            if output:
                print(output.strip(), end="\r")
        print("", end="\r")

    self.logger.info("Finished processing %d articles.", articles.count())
    self.logger.info(f"Preprocessed articles available at {output_path}.")

    try:
        processed_articles = Articles.from_file(output_path)
        self.failed_attemps = 0
        return processed_articles
    except JSONDecodeError:
        self.logger.error(
            f"Failed to read the processed articles... trying again (attempt {self.failed_attemps}).")
        self.failed_attemps += 1

        if self.failed_attemps > 5:
            raise
        else:
            return self.__execute_swift_program(articles)
def predict(self, articles_original: Articles) -> ArticlesPrediction:
    """
    Pre-processes the articles, computes the predictions for each of them and aggregates
    the predictions into an ArticlesPrediction object, which is returned.
    :param articles_original: NON-preprocessed articles
    """
    predictions = ArticlesPrediction(self.theme_tokenizer, articles_original)
    processed_articles = Articles(
        [article for article in self.preprocessor.process_articles(articles_original)])

    self.logger.debug("Will start predictions with keras model")
    matrix = self.article_tokenizer.transform_to_sequences(processed_articles)
    prediction_matrix = self.classifier_model.predict(matrix)
    self.logger.debug("Did predictions with keras model")

    for idx, prediction_vector in enumerate(prediction_matrix):
        article_id = processed_articles[idx].id
        predictions.addPredictionsForArticle(prediction_vector, article_id)

    self.logger.info("Finished predicting themes for %d articles",
                     articles_original.count())

    return predictions
def test_subtraction(self):
    articles = self.create_articles()
    articles_to_remove = Articles(self.create_articles()[0:2])

    filtered_articles = articles - articles_to_remove

    self.assertEqual(filtered_articles.count() + 2, articles.count())
    self.assertFalse(filtered_articles.contains(articles_to_remove[0].id))
    self.assertFalse(filtered_articles.contains(articles_to_remove[1].id))
    self.assertTrue(filtered_articles.contains(articles[2].id))
    self.assertTrue(filtered_articles.contains(articles[3].id))
    self.assertTrue(filtered_articles.contains(articles[4].id))
    self.assertTrue(filtered_articles.contains(articles[5].id))
def test_boolean_vector_to_themes(self):
    article1 = Article("1", "title", "summary",
                       ["theme1", "theme2", "theme3"], [], [])
    article2 = Article("2", "title", "summary", ["theme1", "theme4"], [], [])
    articles = Articles([article1, article2])

    tokenizer = ArticleThemeTokenizer(articles)

    self.assertEqual(4, tokenizer.themes_count)
    self.assertEqual(["theme1", "theme2", "theme3", "theme4"],
                     tokenizer.orderedThemes)
    self.assertEqual(["theme1", "theme4"],
                     tokenizer.boolean_vector_to_themes([True, False, False, True]))
    self.assertEqual([],
                     tokenizer.boolean_vector_to_themes([False, False, False, False]))
    self.assertEqual(["theme3"],
                     tokenizer.boolean_vector_to_themes([False, False, True, False]))
def create_articles() -> Articles:
    article1 = Article(title="Title", summary="summary", themes=[],
                       verified_themes=[], predicted_themes=[], id="1")
    article2 = Article(title="Title", summary="summary", themes=["T"],
                       verified_themes=["T"], predicted_themes=[], id="2")
    article3 = Article(title="Title", summary="summary", themes=["T", "T2"],
                       verified_themes=[], predicted_themes=[], id="3")
    article4 = Article(title="Title", summary="summary", themes=[],
                       verified_themes=["T"], predicted_themes=[], id="4")
    article5 = Article(title="Title", summary="summary", themes=["T2"],
                       verified_themes=["T"], predicted_themes=[], id="5")
    article6 = Article(title="Title", summary="summary", themes=["T", "T2", "T3"],
                       verified_themes=["T", "T2", "T3"], predicted_themes=["T3"], id="6")

    return Articles([article1, article2, article3, article4, article5, article6])
def predict_preprocessed(self, processed_articles: Articles) -> ArticlesPrediction:
    """
    Computes the predictions for the given articles and aggregates them into an
    ArticlesPrediction object, which is returned.
    The articles must have been pre-processed beforehand!
    :param processed_articles: preprocessed articles
    """
    predictions = ArticlesPrediction(self.theme_tokenizer, processed_articles)

    self.logger.debug("Will start predictions with keras model")
    matrix = self.article_tokenizer.transform_to_sequences(processed_articles)
    prediction_matrix = self.classifier_model.predict(matrix)
    self.logger.debug("Did predictions with keras model")

    for idx, prediction_vector in enumerate(prediction_matrix):
        article_id = processed_articles[idx].id
        predictions.addPredictionsForArticle(prediction_vector, article_id)

    self.logger.info("Finished predicting themes for %d articles",
                     processed_articles.count())

    return predictions
from tensorflow.keras.models import load_model

ARTICLE_JSON_FILE = "articles_{}.json"
LANG = "fr"
LANG_FULL = "french"
MODEL_PATH = "model.h5"  # Relative path
LIMIT_ARTICLE_COUNT = None  # None or a number.

SUPPORTED_THEMES: List[str] = ["computer", "smartphone"]

# Loads the articles
# ==================
articles_filepath = ARTICLE_JSON_FILE.format(LANG)

if LIMIT_ARTICLE_COUNT is None:
    all_articles: Articles = Articles.from_file(articles_filepath)
else:
    all_articles: Articles = Articles.from_file(articles_filepath, LIMIT_ARTICLE_COUNT)

# Load the model
# ==================
model = load_model(MODEL_PATH,
                   custom_objects={"WeightedBinaryCrossEntropy": WeightedBinaryCrossEntropy()})

# Perform evaluation
# ==================
F1AUCModelEvaluator().evaluate(all_articles, SUPPORTED_THEMES)
def testApplyOnArticlesDefaultThreshold(self):
    article1 = Article("1", "title", "summary", ["theme1"],
                       ["theme1", "old_prediction"], ["theme1"])
    article2 = Article("2", "title", "summary", ["theme1", "theme2"],
                       ["other__old_predicted_theme"],
                       ["theme1", "theme2", "theme3"])
    # article3 is not used in the assertions, but it is necessary for the tokenizer to know theme3.
    article3 = Article("3", "title", "summary", ["theme3"], [], [])
    articles = Articles([article1, article2])

    theme_tokenizer = ArticleThemeTokenizer(
        Articles([article1, article2, article3]))

    predictions = ArticlesPrediction(theme_tokenizer, articles)
    predictions.addPredictionsForArticle([0.1, 0.7, 0], article1.id)
    predictions.addPredictionsForArticle([0.4, 0.89, 0.99], article2.id)

    # Apply predictions with the default threshold.
    predicted_articles = predictions.get_articles_with_predictions()
    predicted_articles_one = predicted_articles[0]
    predicted_articles_two = predicted_articles[1]

    self.assertEqual(1, len(predicted_articles_one.predicted_themes))
    self.assertFalse("theme1" in predicted_articles_one.predicted_themes)
    self.assertTrue("theme2" in predicted_articles_one.predicted_themes)
    self.assertFalse("theme3" in predicted_articles_one.predicted_themes)

    self.assertEqual(2, len(predicted_articles_two.predicted_themes))
    self.assertFalse("theme1" in predicted_articles_two.predicted_themes)
    self.assertTrue("theme2" in predicted_articles_two.predicted_themes)
    self.assertTrue("theme3" in predicted_articles_two.predicted_themes)

    # Check that 'themes' and 'verified_themes' are not touched!
    self.assertEqual(1, len(predicted_articles_one.themes))
    self.assertTrue("theme1" in predicted_articles_one.themes)
    self.assertFalse("theme2" in predicted_articles_one.themes)
    self.assertFalse("theme3" in predicted_articles_one.themes)

    self.assertEqual(2, len(predicted_articles_two.themes))
    self.assertTrue("theme1" in predicted_articles_two.themes)
    self.assertTrue("theme2" in predicted_articles_two.themes)
    self.assertFalse("theme3" in predicted_articles_two.themes)

    self.assertEqual(1, len(predicted_articles_one.verified_themes))
    self.assertTrue("theme1" in predicted_articles_one.verified_themes)
    self.assertFalse("theme2" in predicted_articles_one.verified_themes)
    self.assertFalse("theme3" in predicted_articles_one.verified_themes)

    self.assertEqual(3, len(predicted_articles_two.verified_themes))
    self.assertTrue("theme1" in predicted_articles_two.verified_themes)
    self.assertTrue("theme2" in predicted_articles_two.verified_themes)
    self.assertTrue("theme3" in predicted_articles_two.verified_themes)

    # Apply predictions with a custom threshold.
    predicted_articles = predictions.get_articles_with_predictions(0.09)
    predicted_articles_one = predicted_articles[0]
    predicted_articles_two = predicted_articles[1]

    self.assertEqual(2, len(predicted_articles_one.predicted_themes))
    self.assertTrue("theme1" in predicted_articles_one.predicted_themes)
    self.assertTrue("theme2" in predicted_articles_one.predicted_themes)
    self.assertFalse("theme3" in predicted_articles_one.predicted_themes)

    self.assertEqual(3, len(predicted_articles_two.predicted_themes))
    self.assertTrue("theme1" in predicted_articles_two.predicted_themes)
    self.assertTrue("theme2" in predicted_articles_two.predicted_themes)
    self.assertTrue("theme3" in predicted_articles_two.predicted_themes)
def process_articles(self, articles: Articles) -> Articles:
    return articles.deep_copy()
debugLogger.info(
    "\n\n\n####################################\n####################################"
)

############################################
# Data loading
############################################

# Loading the file
# ============================
debugLogger.info("Loading the file")

articles_filepath = ARTICLE_JSON_FILE.format(LANG)

if LIMIT_ARTICLES_TRAINING:
    all_articles: Articles = Articles.from_file(articles_filepath, 600)
else:
    all_articles: Articles = Articles.from_file(articles_filepath)

all_articles.shuffle()

for article in all_articles:
    article.make_immutable()

# Data filtering and partitioning
# ============================
articles_train: Articles = all_articles.articles_with_all_verified_themes(
    [SUPPORTED_THEME]).deep_copy()

# Remove all unsupported themes and keep only articles that have at least one supported theme.