Пример #1
0
    def _findAssociation_ReadArticleFirst(self, articles, rankLimit=7):
        """Find associations between articles using links read from each article's text.

        For every title in ``articles`` the article content is fetched through a
        new ``Wiki`` instance (which is also stored on ``self.wiki``) and handed
        to ``WikiTextReader.readLinks``. The link names returned for each article
        are collected into a ``collections.Counter``, and the per-article
        counters are passed on to ``self._findSharedLinks``.

        Input Parameters:
        articles: The article titles to find associations between
        rankLimit: Forwarded unchanged to ``_findSharedLinks``

        Returns:
        Whatever ``self._findSharedLinks`` returns for the collected counters.
        """
        self.wiki = Wiki()
        reader = WikiTextReader()

        linkCounters = {}
        for title in articles:
            wikiText = self.wiki.getArticle(title)
            # NOTE(review): the 0, 0, 100000 arguments presumably relax the scan limits and the
            # maximum link count so that nothing is filtered out -- confirm against readLinks.
            linkFreqPairs = reader.readLinks(title, wikiText, 0, 0, 100000)
            # readLinks yields (link, freq) pairs; only the link names are counted here.
            linkCounters[title] = collections.Counter(
                name for (name, _freq) in linkFreqPairs)

        return self._findSharedLinks(linkCounters, articles, rankLimit)
Пример #2
0
    def _selectImportantLinks_Freq(self, mainArticleName, interimLinks, aliases, content, secondScanLimit = _SECOND_SCAN_LIMIT,
                                   maxLinkLimit = _MAX_LINKS_LIMIT):
        """
        Select the most important links based on how often they (or their aliases) appear in the text.

        Each link's frequency is recomputed with ``self._searchContent``. If
        ``Wiki.doesPageContainLink(mainArticleName, link)`` is true, the frequency is additionally multiplied by
        ``self._DOUBLE_LINK_MULTIPLIER``. The result is then bounded by repeatedly raising the minimum accepted
        frequency until fewer than ``maxLinkLimit`` links remain.


        Input Parameters:
            mainArticleName: The title of the article the links were gathered from. It is passed to
             ``Wiki.doesPageContainLink`` together with each link.
            interimLinks: A list of tuples in the form of (link,freq) that shows the frequency of a single link in
             the text. These are the initial links that have been gathered but should be filtered further.
            aliases: A map of all the aliases that are known. This map is in the form of { "name" = "Alias" } which
             shows an Alias for a given name. The aliases map should be constructed when the text is searched for
             links.
            content: The actual wikiText content to search in. This is the result of the retrieval of RAW data from
             Wikipedia.
            secondScanLimit: The starting frequency threshold of the tightening loop. A link survives a pass only
             if its frequency is strictly greater than ``secondScanLimit + step``.
            maxLinkLimit: The tightening loop runs for as long as the number of links is greater than or equal to
             this value.


        Returns:
             A list of filtered tuples in the form of (link,freq) that are selected from interimLinks based
            on their recomputed frequencies. The list may be empty.
        """

        #Search the content for the link or its aliases and get the total number of frequencies for each link.
        finalLinks = []
        wiki = Wiki()
        for (topLink, freq) in interimLinks:
            newFreq = self._searchContent(topLink, freq, aliases, content)
            #NOTE(review): presumably true when the two pages link to each other ("double link") -- confirm
            #against Wiki.doesPageContainLink.
            if wiki.doesPageContainLink(mainArticleName, topLink):
                newFreq *= self._DOUBLE_LINK_MULTIPLIER
            finalLinks.append((topLink, newFreq))

        #Here we have a large number of links and frequencies. Start from secondScanLimit and in each step make the
        #threshold of acceptable frequencies tighter until the number of links is smaller than maxLinkLimit.
        #The emptiness check guarantees termination: without it a maxLinkLimit of zero (or less) would keep the
        #loop spinning forever on an already empty list, because 0 >= 0 is always true.
        step = 0
        while finalLinks and len(finalLinks) >= maxLinkLimit:
            finalLinks = [(link, freq) for (link, freq) in finalLinks if freq > secondScanLimit + step]
            step += 1
        return finalLinks
Пример #3
0
    def getImportantLinks(self,
                          articleTitle,
                          selectionAlgorithm=SelectionAlgorithm.PageRank,
                          outputLimit=15):
        """Retrieve the most important links of an article using the chosen ranking algorithm.


        The article is fetched through a new ``Wiki`` instance (also stored on ``self.wiki``), its links are read
        with ``WikiTextReader.readLinks`` and the resulting links are handed to the method of this object named
        ``_selectLinks_<selectionAlgorithm>``.


        According to the original documentation, the reader itself applies a bag of words step while reading:
        it first identifies all links and then selects the most frequent of those links in the wikiText, so the
        chosen algorithm always works on top of that pre-selection.


        The original documentation also notes that page rank currently takes some time to finish, and that a
        Hadoop server with MapReduce, caching and an index database would bring this down to milliseconds.


        Input Parameters:
        articleTitle : The title of the article to retrieve and rank the links for
        selectionAlgorithm : The algorithm to use for ranking; it is formatted into the name of the
        ``_selectLinks_...`` method that gets called
        outputLimit: Passed to the selected ranking method as the number of links to return


        Returns:
        The value returned by the selected ``_selectLinks_...`` method (documented originally as a list containing
        the top link titles, as many as outputLimit).
        """

        #Fetch the raw article text
        self.wiki = Wiki()
        rawText = self.wiki.getArticle(articleTitle)

        #Extract the candidate links from the wikiText
        candidateLinks = WikiTextReader().readLinks(articleTitle, rawText)

        #Resolve the ranking method by name and let it pick the links to return
        rankerName = "_selectLinks_%s" % selectionAlgorithm
        ranker = getattr(self, rankerName)
        return ranker(candidateLinks, outputLimit)
Пример #4
0
    def _findAssociation_SharedLinksFirst(self, articles, rankLimit=7):
        """ Find soft associations between a list of articles from the links each article reports

        A new ``Wiki`` instance is created (and stored on ``self.wiki``); for each article title the result of
        ``Wiki.getLinks`` is turned into a ``collections.Counter`` and all counters are handed to
        ``self._findSharedLinks``.

        Input Parameters:
        articles: A list containing all the article names that you want to find associations between
        rankLimit: Forwarded to ``_findSharedLinks``; documented originally as the maximum number of top shared
        article associations to return.

        returns:
        The result of ``self._findSharedLinks``, documented originally as an adjacency list representation of the
        graph that associates all the articles with intermediate articles and the original articles as vertices.
        """

        self.wiki = Wiki()

        #One multi set of links per article, keyed by the article title.
        linkCounters = {
            title: collections.Counter(self.wiki.getLinks(title))
            for title in articles
        }

        return self._findSharedLinks(linkCounters, articles, rankLimit)
Пример #5
0
        #before in the body of the text increase its initial frequency counter; Otherwise, add a new entry to the cleanLinks
        #map. This map contains {nameOfTheLink = frequencyOfTheLink} and will be updated as new links are encountered.
        cleanLinks = {}
        aliases = {}
        for link in dirtyLinks:
            if link.find(':') == -1:
                stringItems = link.split('|')
                key = stringItems[0]
                if cleanLinks.has_key(key):
                    cleanLinks[key] += 1
                else:
                    cleanLinks[key] = 1
                    if len(stringItems) > 1:
                        aliases[key] = stringItems[1]
        #Links that appear only once in the text should be filtered out.
        interimLinks = [ (topLink, freq) for (topLink, freq) in cleanLinks.items() if freq > firstScanLimit]

        #For the remaining links adjustments must be made so that we get the most important links and the result should
        #be returned.
        return self._selectImportantLinks_Freq(articleName, interimLinks, aliases, content, secondScanLimit, maxLinkLimit)


if __name__ == "__main__":
    # Manual smoke test: fetch one article and dump the links selected for it.
    from wikiadapter import Wiki
    wiki = Wiki()
    articleName = "Shine On You Crazy Diamond"
    content = wiki.getArticle(articleName)
    reader = WikiTextReader()
    # Read the links once and reuse the result for both outputs instead of
    # scanning the article a second time with identical arguments.
    links = reader.readLinks(articleName, content)
    # Single-argument print(...) produces the same output under Python 2 and 3.
    print(len(links))
    print(["%s=%s" % (link, freq) for (link, freq) in links])