示例#1
0
    def _get_matched_and_unmatched_contents(self):
        """Pair up text and media contents and report what was left over.

        Returns:
            A dict with three keys:

            * ``"matched_contents"`` -- the result of
              ``TextMediaMatchingHelper.get_text_media_matching()``, or an
              empty list when no matching was attempted.
            * ``"unused_contents"`` -- the contents that did not take part
              in the matching.
            * ``"unused_content_type"`` -- the kind of the unused contents
              (``"text"`` or ``"media"`` on the short-circuit path,
              otherwise whatever the preprocessor reports).
        """
        has_text = len(self.text_contents) != 0

        # Matching needs both sides; when one is missing, hand the other
        # side back untouched as "unused".
        if not has_text or len(self.media_contents) == 0:
            if has_text:
                leftovers, leftover_kind = self.text_contents, "text"
            else:
                # Also reached when both sides are empty.
                leftovers, leftover_kind = self.media_contents, "media"
            return {
                "matched_contents": [],
                "unused_contents": leftovers,
                "unused_content_type": leftover_kind
            }

        # The preprocessor decides which contents take part in the matching
        # and which are set aside.
        formatted = TextMediaMatchingPreprocessor(
            self.text_contents, self.media_contents).get_formatted_content()
        sentences = formatted["sentences"]
        media = formatted["media"]
        leftovers = formatted["content_unused_for_matching"]
        leftover_kind = formatted["unused_content_type"]

        pairing = TextMediaMatchingHelper(
            sentences, media,
            self.distance_metric_type).get_text_media_matching()

        return {
            "matched_contents": pairing,
            "unused_contents": leftovers,
            "unused_content_type": leftover_kind
        }
def test_text_media_preprocessor_return_format():
    ''' The formatted content is a dict that carries the expected keys'''
    formatted = TextMediaMatchingPreprocessor(
        [sentence_1], [media_related_to_sentence_1]).get_formatted_content()

    assert isinstance(formatted, dict)
    # Only key presence is checked here, not the values behind the keys.
    for expected_key in ("sentences", "media",
                         "content_unused_for_matching"):
        assert expected_key in formatted
def test_text_media_preprocessor_eliminates_sentences():
    ''' With two sentences and one media item, the surplus sentence
    is reported as unused
    '''
    formatted = TextMediaMatchingPreprocessor(
        [sentence_1, sentence_2],
        [media_related_to_sentence_2]).get_formatted_content()

    # sentence_2 is kept for matching; sentence_1 is set aside.
    assert formatted["sentences"] == [sentence_2]
    assert formatted["media"] == [media_related_to_sentence_2]
    assert formatted["content_unused_for_matching"] == [sentence_1]
def test_text_media_preprocessor_prunes_list():
    ''' _get_pruned_list drops one item and records it as unused'''
    preprocessor = TextMediaMatchingPreprocessor(
        [sentence_1, sentence_2],
        [media_related_to_sentence_1, media_related_to_sentence_2])
    # Ask for exactly one item to be eliminated.
    preprocessor.count_of_content_to_eliminate = 1

    # The leading entry of the index list (index 1) names the item that
    # is expected to be eliminated.
    elimination_order = [1, 0, 2]
    items = ['list_item1', 'list_item2', 'list_item3']
    kept_items = preprocessor._get_pruned_list(elimination_order, items)

    assert kept_items == ['list_item1', 'list_item3']
    assert preprocessor.content_unused_for_matching == ['list_item2']
def test_text_media_preprocessor_returns_sentence_list_when_media_is_empty():
    ''' With no media at all, nothing is kept for matching and the
    sentence list comes back as unused content
    '''
    formatted = TextMediaMatchingPreprocessor(
        [sentence_1], []).get_formatted_content()

    assert formatted["sentences"] == []
    assert formatted["media"] == []
    assert formatted["content_unused_for_matching"] == [sentence_1]
def test_text_media_preprocessor_returns_empty_unused_content_when_content_count_is_equal():  # noqa
    ''' With as many sentences as media items, no content is
    left unused
    '''
    formatted = TextMediaMatchingPreprocessor(
        [sentence_1, sentence_2],
        [media_related_to_sentence_1,
         media_related_to_sentence_2]).get_formatted_content()

    # The sentences are expected back in a different order than they were
    # passed in, while the media keep their original order.
    assert formatted["sentences"] == [sentence_2, sentence_1]
    assert formatted["media"] == [
        media_related_to_sentence_1, media_related_to_sentence_2
    ]
    assert formatted["content_unused_for_matching"] == []