def getLinkFromStringLink(self, originatingLink, stringLink): domain = "" path = "" newlink = urlparse(stringLink) # If the link does not contain a domain, we use the domain of the originating link if newlink.netloc == "": domain = originatingLink.domain else: domain = newlink.netloc path = newlink.path # Add arguments to path, if found if newlink.query != "": path = path + "?" + newlink.query # Anchors within same document and relative links are ignored if path.startswith("#"): return None elif path.startswith(".."): return None # Create a link object link = Link.linkFromDomainAndPath(domain, path) return link
def getVisitedLinks(self, num): self.mutex.acquire() links = [] domain_index = 1 path_index = 2 try: query = 'SELECT * FROM CrawledLinks EXCEPT (SELECT * FROM UnvisitedLinks) LIMIT ?' arguments = (num,) cursor = self.connection.cursor() cursor.execute(query, arguments) allentries = cursor.fetchall() for row in allentries: domain = str(row[domain_index]) path = str(row[path_index]) link = Link.linkFromDomainAndPath(domain, path) links.append(link) except Exception, e: pass