Пример #1
0
    def getLinkFromStringLink(self, originatingLink, stringLink):
        """Build a Link from an href string found on another link's page.

        Args:
            originatingLink: link the href was found on; its ``domain``
                attribute is used when ``stringLink`` carries no domain.
            stringLink: raw href text to convert.

        Returns:
            The result of ``Link.linkFromDomainAndPath(domain, path)``, where
            ``path`` includes the query string when one is present, or None
            when the href is a same-document anchor ("#...") or a
            parent-relative path ("..").
        """
        # Anchors within the same document are ignored. This has to be checked
        # on the raw string: urlparse moves everything after "#" into
        # ``fragment``, so the parsed path of "#top" is empty and never
        # starts with "#".
        if stringLink.startswith("#"):
            return None

        newlink = urlparse(stringLink)

        path = newlink.path

        # Add arguments to path, if found
        if newlink.query != "":
            path = path + "?" + newlink.query

        # Parent-relative links are ignored
        if path.startswith(".."):
            return None

        # If the link does not contain a domain, we use the domain of the
        # originating link
        if newlink.netloc == "":
            domain = originatingLink.domain
        else:
            domain = newlink.netloc

        # Create a link object
        return Link.linkFromDomainAndPath(domain, path)
    def getVisitedLinks(self, num):
        self.mutex.acquire()
        links = []
        domain_index = 1
        path_index = 2
        
        try:
            query = 'SELECT * FROM CrawledLinks EXCEPT (SELECT * FROM UnvisitedLinks) LIMIT ?'
            arguments = (num,)
            
            cursor = self.connection.cursor()
            cursor.execute(query, arguments)
            allentries = cursor.fetchall()

            for row in allentries:
                domain = str(row[domain_index])
                path = str(row[path_index])
                link = Link.linkFromDomainAndPath(domain, path)
                links.append(link)
        except Exception, e:
            pass