Пример #1
0
    def _selectImportantLinks_Freq(self, mainArticleName, interimLinks, aliases, content,
                                   secondScanLimit=_SECOND_SCAN_LIMIT, maxLinkLimit=_MAX_LINKS_LIMIT):
        """
        Select the most important links based on how often they, or their aliases, appear in the text.

        Each link's frequency is first recomputed through self._searchContent and, when
        Wiki.doesPageContainLink(mainArticleName, link) reports true, multiplied by
        self._DOUBLE_LINK_MULTIPLIER. The result is then bounded by repeatedly raising the minimum
        acceptable frequency until fewer than maxLinkLimit links remain.


        Input Parameters:
            mainArticleName: Name of the article being processed. It is passed to
             Wiki.doesPageContainLink together with each link (presumably to detect articles that
             link to each other -- confirm against the Wiki class).
            interimLinks: A list of tuples in the form of (link, freq) that shows the frequency of a single link in
             the text. These are the initial links that have been gathered but should be filtered further.
            aliases: A map of all the aliases that are known. This map is in the form of { "name" = "Alias" } which
             shows an Alias for a given name. The aliases map should be constructed when the text is searched for
             links.
            content: The actual wikiText content to search in. This is the result of the retrieval of RAW data from
             Wikipedia.
            secondScanLimit: Initial frequency threshold. A link is kept only while its frequency is strictly
             greater than the current threshold; the threshold grows by one on every filtering pass.
            maxLinkLimit: Filtering continues for as long as the number of remaining links is greater than or
             equal to this value. For a value of zero or less nothing can satisfy the bound, so an empty list
             is returned.


        Returns:
            A list of filtered tuples in the form of (link, freq), in their original relative order, where freq
            is the recomputed (and possibly multiplied) frequency. If the initial list is already shorter than
            maxLinkLimit, it is returned without any threshold filtering.
        """

        # Search the content for the link or its aliases and get the total frequency for each link.
        wiki = Wiki()
        finalLinks = []
        for (topLink, freq) in interimLinks:
            newFreq = self._searchContent(topLink, freq, aliases, content)
            if wiki.doesPageContainLink(mainArticleName, topLink):
                newFreq *= self._DOUBLE_LINK_MULTIPLIER
            finalLinks.append((topLink, newFreq))

        # At this point there may be a large number of links. Start from secondScanLimit and make the threshold
        # of acceptable frequencies tighter on each pass until fewer than maxLinkLimit links are left.
        # The emptiness check guarantees termination when maxLinkLimit <= 0: an empty list would otherwise
        # still satisfy len(finalLinks) >= maxLinkLimit and the loop would never end.
        step = 0
        while finalLinks and len(finalLinks) >= maxLinkLimit:
            threshold = secondScanLimit + step
            finalLinks = [(link, freq) for (link, freq) in finalLinks if freq > threshold]
            step += 1
        return finalLinks