예제 #1
0
    def test_get_type_no_pages(self):
        """
        An empty list of pages should produce an empty type mapping.
        """

        types = info.types([])
        self.assertEqual({}, types)
예제 #2
0
    def test_get_type_list(self):
        """
        A page that enumerates articles should be typed as a list.
        """

        title = 'List of works by Michelangelo'
        types = info.types(title)
        self.assertEqual(info.ArticleType.LIST, types[title])
예제 #3
0
    def test_get_type_redirect(self):
        """
        A redirecting page should be keyed by the input title, not the target,
        and still resolve to a normal article type.
        """

        title = 'Olympique Lyon'
        types = info.types(title)
        self.assertEqual(info.ArticleType.NORMAL, types[title])
예제 #4
0
    def test_get_type_disambiguation_page(self):
        """
        A disambiguation page should be typed accordingly.
        """

        title = 'Garcia'
        types = info.types(title)
        self.assertEqual(info.ArticleType.DISAMBIGUATION, types[title])
예제 #5
0
    def test_get_type_missing_page(self):
        """
        A non-existent page should be typed as missing.
        """

        title = 'Rudi Garcia (French coach)'
        types = info.types(title)
        self.assertEqual(info.ArticleType.MISSING, types[title])
예제 #6
0
    def test_get_type(self):
        """
        An ordinary article should be typed as normal.
        """

        title = 'Rudi Garcia'
        types = info.types(title)
        self.assertEqual(info.ArticleType.NORMAL, types[title])
예제 #7
0
    def test_get_type_multiple_pages(self):
        """
        Requesting several pages at once should return a type for each of them.
        """

        pages = ['Bordeaux', 'Lyon']
        types = info.types(pages)
        self.assertEqual(len(pages), len(types))
        self.assertEqual(set(pages), set(types))
        for page_type in types.values():
            self.assertEqual(info.ArticleType.NORMAL, page_type)
예제 #8
0
    def _resolve_unambiguous_candidates(self, candidates):
        """
        Resolve the candidates that are unambiguous.
        The function handles three possiblities:

            #. There are candidates that have a page, and therefore the candidate is resolved to them.
            #. Others have a disambiguation page and are thus ambiguous.
               These candidates are resolved elsewhere.
            #. Other candidates return an empty result.
               In this case, they are said to be unresolved.

        :param candidates: The candidates to resolve.
                           The candidates should be in the form of a dictionary.
                           The keys should be the candidates, and the values the scores.
        :type candidates: dict

        :return: A tuple containing the resolved, unresolved and ambiguous candidates respectively.
        :rtype: tuple of lists
        """

        resolved_candidates, unresolved_candidates, ambiguous_candidates = [], [], []

        for candidate in candidates:
            # NOTE: `page_types` was previously named `text`, shadowing the
            # module-level `text` helper; `page_type` avoids shadowing the
            # `type` builtin.
            page_types = info.types([candidate])
            for page, page_type in page_types.items():
                # Some pages resolve directly, though they may need to
                # redirect; those are retained unchanged to respect domain
                # discourse.
                if page_type is info.ArticleType.NORMAL:
                    resolved_candidates.append(candidate)
                    break
                elif page_type is info.ArticleType.DISAMBIGUATION:
                    ambiguous_candidates.append(candidate)
                    break

            # If the candidate was neither resolved nor found ambiguous above
            # (missing page, list page, or empty result), it is unresolved.
            if (candidate not in resolved_candidates
                    and candidate not in ambiguous_candidates):
                unresolved_candidates.append(candidate)

        return resolved_candidates, unresolved_candidates, ambiguous_candidates
예제 #9
0
    def test_get_type_many_pages(self):
        """
        Requesting more pages than the stagger value should still return
        information about every page.
        """

        pages = [
            'Anthony Lopes', 'Mapou Yanga-Mbiwa', 'Joachim Andersen', 'Rafael',
            'Jason Denayer', 'Marcelo', 'Martin Terrier', 'Houssem Aouar',
            'Moussa Dembélé', 'Bertrand Traoré', 'Memphis Depay',
            'Thiago Mendes', 'Léo Dubois', 'Oumar Solet',
            'Jeff Reine-Adélaïde', 'Rayan Cherki', 'Bruno Guimarães',
            'Amine Gouiri', 'Marçal', 'Karl Toko Ekambi', 'Jean Lucas',
            'Kenny Tete', 'Maxence Caqueret', 'Camilo Reijers de Oliveira',
            'Maxwel Cornet', 'Youssouf Koné', 'Lucas Tousart',
            'Ciprian Tătărușanu', 'Boubacar Fofana'
        ]
        types = info.types(pages)
        self.assertEqual(len(pages), len(types))
        self.assertEqual(set(pages), set(types))
        self.assertEqual(info.ArticleType.DISAMBIGUATION, types['Rafael'])
예제 #10
0
    def resolve(self, candidates, *args, **kwargs):
        """
        Resolve the given candidates.
        They are sorted according to their score.

        :param candidates: The candidates to resolve.
                           The keys are the candidate names, and the values their scores.
        :type candidates: dict

        :return: A tuple containing the resolved and unresolved candidates respectively.
        :rtype: tuple of lists
        """

        resolved_candidates, unresolved_candidates = [], []

        # Process candidates in descending order of score.
        candidates = sorted(candidates.keys(),
                            key=lambda candidate: candidates.get(candidate),
                            reverse=True)
        for candidate in candidates:
            # Get the possible pages for each candidate.
            # From each page, the brackets are removed only temporarily to
            # check whether the non-bracket part contains a year: pages with
            # years in them are most often not entities (although exceptions
            # exist, such as `TSG 1899 Hoffenheim`).  The page names
            # themselves are retained as-is, brackets included.
            pages = search.collect(candidate, limit=5)
            pages = [
                page for page in pages
                if not self._has_year(self._remove_brackets(page))
            ]

            # Fetch the page types and keep only normal articles:
            # disambiguation, list and missing pages are removed altogether.
            # NOTE: the comprehension variable no longer shadows the `type`
            # builtin.
            types = info.types(pages)
            pages = [
                page for page, page_type in types.items()
                if page_type is info.ArticleType.NORMAL
            ]
            if pages:
                # Score the remaining pages based on relevance to the corpus.
                articles = text.collect(pages, introduction_only=True)
                candidate_document = Document(
                    candidate,
                    self.tokenizer.tokenize(candidate),
                    scheme=self.scheme)

                # To calculate the score, bracketed text is removed since it
                # does not convey important information, and tokens that are
                # part of the candidate name are removed from the sentence.
                scores = {}
                for page, introduction in articles.items():
                    introduction = self._remove_brackets(introduction)
                    sentence = self._get_first_sentence(introduction)
                    tokens = self.tokenizer.tokenize(sentence)
                    tokens = [
                        token for token in tokens
                        if token not in candidate_document.dimensions
                    ]
                    sentence_document = Document(introduction,
                                                 tokens,
                                                 scheme=self.scheme)

                    title_document = Document(page,
                                              self.tokenizer.tokenize(page),
                                              scheme=self.scheme)
                    scores[page] = self._compute_score(candidate_document,
                                                       title_document,
                                                       sentence_document)

                # Get the most relevant article (on ties, the earliest page
                # wins, exactly as with a stable descending sort).  If its
                # score exceeds the threshold and the article has not already
                # been resolved, the candidate resolves to it; otherwise the
                # candidate is unresolved.
                article, score = max(scores.items(),
                                     key=lambda item: item[1])
                if score >= self.threshold and article not in resolved_candidates:
                    resolved_candidates.append(article)
                    continue

            unresolved_candidates.append(candidate)

        return (resolved_candidates, unresolved_candidates)