def parse_url(self, url):
    """Resolve ``url`` into a ``(host, url)`` pair.

    ``url`` is split with ``urlnorm.parse`` and any query string is
    re-attached to the path as ``path?query``.  When the parsed URL
    carries no host, ``self.host`` is used instead; if that attribute
    does not exist yet, host and url are read from the ``http`` table
    for ``self.inode_id`` and stored on the instance (both become ``''``
    when no row is found, and the path being resolved is reset to ``''``
    as well).

    A path that does not start with ``/`` is treated as relative to the
    directory of ``self.url``.

    Returns:
        A 2-tuple ``(host, url)``.
    """
    _method, host, path, query, _fragment = urlnorm.parse(url)
    if query:
        path = "%s?%s" % (path, query)

    try:
        host = host or self.host
    except AttributeError:
        # self.host has not been set on this instance yet: look it up.
        dbh = DB.DBO(self.case)
        dbh.execute("select host, url from http where inode_id = %r", self.inode_id)
        row = dbh.fetch()
        if not row:
            self.host = host = self.url = path = ''
        else:
            self.host = host = row['host']
            self.url = row['url']

    # Paths anchored at the root need no further work.
    if path.startswith("/"):
        return host, path

    ## Url may be relative to the present directory
    # NOTE(review): self.url is read here even when the lookup above was
    # skipped; presumably it is set elsewhere on the instance - confirm.
    base = os.path.dirname(self.url)
    if not base.endswith("/"):
        base = base + "/"
    return host, base + path
def parse_url(self, url):
    """Return ``(host, url)`` for ``url``, filling gaps from this object.

    The five values produced by ``urlnorm.parse`` are unpacked; the
    query string, when present, is appended to the path with a ``?``.
    A missing host falls back to ``self.host``.  If ``self.host`` is not
    yet an attribute, the ``http`` table row for ``self.inode_id`` is
    fetched and its ``host``/``url`` columns are stored on the instance;
    with no matching row, ``self.host``, ``self.url`` and the path being
    resolved are all set to ``''``.

    Paths that do not begin with ``/`` are joined onto the directory
    part of ``self.url``.
    """
    pieces = urlnorm.parse(url)
    (_method, host, location, query_string, _fragment) = pieces
    if query_string:
        location = "%s?%s" % (location, query_string)

    try:
        host = host or self.host
    except AttributeError:
        # No host on the instance yet - load host and url from the database.
        dbh = DB.DBO(self.case)
        dbh.execute("select host, url from http where inode_id = %r", self.inode_id)
        record = dbh.fetch()
        if record:
            self.host = host = record['host']
            self.url = record['url']
        else:
            self.host = host = self.url = location = ''

    ## Url may be relative to the present directory
    is_relative = not location.startswith("/")
    if is_relative:
        # NOTE(review): relies on self.url existing even when the database
        # lookup above did not run - confirm it is always set by then.
        directory = os.path.dirname(self.url)
        if not directory.endswith("/"):
            directory = directory + "/"
        location = directory + location

    return host, location
def insertIntoDB(links):
    """Add every link not already present to the ``data`` table.

    Each link is parsed with ``urlnorm.parse`` and rebuilt with an empty
    fragment, then looked up by URL through the module-level ``cursor2``.
    If no row exists, one is inserted with the current time and a random
    crawler id from ``random.randrange(0, 100)``.

    Links whose parsing raises ``UnicodeEncodeError`` are skipped.

    Args:
        links: Iterable of URL strings.  An empty iterable is a no-op.

    Returns:
        None.
    """
    # cursor2 is only read here, so no ``global`` statement is required.
    for link in links:
        try:
            portions = urlnorm.parse(link)
        except UnicodeEncodeError:
            # We should either try to handle this and/or have a db for bad links
            continue

        # We throw away fragments
        link = urlparse.urlunsplit(
            (portions[0], portions[1], portions[2], portions[3], ""))

        # Parameters must be a real sequence: ``(link)`` is just a
        # parenthesised string, ``(link,)`` is the one-element tuple.
        cursor2.execute("SELECT url FROM data WHERE url=%s", (link,))
        if cursor2.fetchone() is not None:
            continue

        # The crawler id is only needed when a row is actually inserted.
        crawlerID = random.randrange(0, 100)
        cursor2.execute(
            "INSERT INTO data (url, timestamp_added, crawler_id) VALUES (%s, %s, %s)",
            (link, time.time(), crawlerID))