def _get_matched_and_unmatched_contents(self):
    ''' Pairs text with media and reports whatever was left unpaired.

    Returns a dict with three keys:
      - "matched_contents": the result of the text/media matcher, or an
        empty list when either input list is empty.
      - "unused_contents": the contents that took no part in matching.
      - "unused_content_type": "text" or "media", describing
        "unused_contents".
    '''
    text_is_empty = len(self.text_contents) == 0

    # Nothing can be matched when one side is missing; hand back the other
    # side untouched. The media length is only inspected when text exists.
    if text_is_empty or len(self.media_contents) == 0:
        if text_is_empty:
            leftover = self.media_contents
            leftover_type = "media"
        else:
            leftover = self.text_contents
            leftover_type = "text"
        return {
            "matched_contents": [],
            "unused_contents": leftover,
            "unused_content_type": leftover_type
        }

    formatted = TextMediaMatchingPreprocessor(
        self.text_contents, self.media_contents).get_formatted_content()

    # Pull every field out before building the matcher so a missing key
    # surfaces before any matching work starts.
    sentences = formatted["sentences"]
    media = formatted["media"]
    leftover = formatted["content_unused_for_matching"]
    leftover_type = formatted["unused_content_type"]

    matching_helper = TextMediaMatchingHelper(sentences, media,
                                              self.distance_metric_type)
    return {
        "matched_contents": matching_helper.get_text_media_matching(),
        "unused_contents": leftover,
        "unused_content_type": leftover_type
    }
def test_text_media_preprocessor_return_format():
    ''' Tests if the return format of the preprocessor is correct

    The formatted content must be a dict exposing every key that
    `_get_matched_and_unmatched_contents` reads from it, including
    "unused_content_type".
    '''
    preprocessor = TextMediaMatchingPreprocessor([sentence_1],
                                                 [media_related_to_sentence_1])
    preprocessed_contents_dict = preprocessor.get_formatted_content()

    assert isinstance(preprocessed_contents_dict, dict)

    expected_keys = (
        "sentences",
        "media",
        "content_unused_for_matching",
        # Read unconditionally by the consumer, so it is part of the format.
        "unused_content_type",
    )
    for key in expected_keys:
        assert key in preprocessed_contents_dict, (
            "missing key in preprocessor output: " + key)
def test_text_media_preprocessor_eliminates_sentences():
    ''' With two sentences and one media item, the surplus sentence is
    expected in the unused content rather than in the matching input'''
    formatted = TextMediaMatchingPreprocessor(
        [sentence_1, sentence_2],
        [media_related_to_sentence_2],
    ).get_formatted_content()

    # Only the sentence related to the single media item is kept.
    assert formatted["sentences"] == [sentence_2]
    assert formatted["media"] == [media_related_to_sentence_2]
    assert formatted["content_unused_for_matching"] == [sentence_1]
def test_text_media_preprocessor_prunes_list():
    ''' Checks that pruning removes the expected item and records it as
    unused content'''
    sentences = [sentence_1, sentence_2]
    media = [media_related_to_sentence_1, media_related_to_sentence_2]
    pruner = TextMediaMatchingPreprocessor(sentences, media)

    # Ask for exactly one item to be eliminated.
    pruner.count_of_content_to_eliminate = 1

    index_order = [1, 0, 2]  # the test expects the item at index 1 to go
    items = ['list_item1', 'list_item2', 'list_item3']
    kept_items = pruner._get_pruned_list(index_order, items)

    assert kept_items == ['list_item1', 'list_item3']
    assert pruner.content_unused_for_matching == ['list_item2']
def test_text_media_preprocessor_returns_sentence_list_when_media_is_empty():
    ''' With no media, nothing is offered for matching and the sentence
    list is reported as unused content '''
    formatted = TextMediaMatchingPreprocessor(
        [sentence_1], []).get_formatted_content()

    assert formatted["sentences"] == []
    assert formatted["media"] == []
    assert formatted["content_unused_for_matching"] == [sentence_1]
def test_text_media_preprocessor_returns_empty_unused_content_when_content_count_is_equal(
):  # noqa
    ''' Equal numbers of sentences and media leave no unused content '''
    sentences = [sentence_1, sentence_2]
    media = [media_related_to_sentence_1, media_related_to_sentence_2]
    formatted = TextMediaMatchingPreprocessor(
        sentences, media).get_formatted_content()

    # The test pins the output order: sentences come back reversed,
    # media in the order given.
    assert formatted["sentences"] == [sentence_2, sentence_1]
    assert formatted["media"] == [
        media_related_to_sentence_1,
        media_related_to_sentence_2,
    ]
    assert formatted["content_unused_for_matching"] == []