Пример #1
0
    def parse_url(self, url):
        """ Returns a tuple of (host, url) for the given url.

        The url is split with urlnorm.parse. A query string, if present,
        is re-attached to the path; the method and fragment parts are not
        used. When the url carries no host, self.host is used instead. If
        self.host has not been set yet, host and url are read from the
        http table row matching self.inode_id and stored on self; when no
        such row exists both attributes (and the returned host and path)
        are reset to ''. A path that does not start with "/" is treated
        as relative to the directory of self.url.
        """
        (_method, netloc, path, query, _fragment) = urlnorm.parse(url)

        if query:
            path = "%s?%s" % (path, query)

        try:
            netloc = netloc or self.host
        except AttributeError:
            ## Nothing remembered on this object yet - ask the http table
            dbh = DB.DBO(self.case)
            dbh.execute("select host, url from http where inode_id = %r", self.inode_id)
            record = dbh.fetch()
            if not record:
                self.host = netloc = self.url = path = ''
            else:
                self.host = netloc = record['host']
                self.url = record['url']

        ## Absolute paths are returned as they are
        if path.startswith("/"):
            return netloc, path

        ## Url may be relative to the present directory
        base = os.path.dirname(self.url)
        if not base.endswith("/"):
            base += "/"

        return netloc, base + path
Пример #2
0
    def parse_url(self, url):
        """ Returns a tuple of (host, url) resolved from the given url.

        urlnorm.parse supplies five parts, of which only the host, the
        path and the query string are used; the query string is appended
        back onto the path after a "?". A url without a host inherits
        self.host. Should self.host be missing, the http table is queried
        by self.inode_id and the row's host and url are cached on self
        (everything is set to '' if the query yields no row). Paths that
        do not begin with "/" are prefixed with the directory part of
        self.url.
        """
        (_method, target_host, target_path, qs, _fragment) = urlnorm.parse(url)

        if qs:
            target_path = "%s?%s" % (target_path, qs)

        try:
            target_host = target_host or self.host
        except AttributeError:
            ## self.host is not known yet, so load it (and self.url) from
            ## the http table
            dbh = DB.DBO(self.case)
            dbh.execute("select host, url from http where inode_id = %r",
                        self.inode_id)
            found = dbh.fetch()
            if found:
                self.host = target_host = found['host']
                self.url = found['url']
            else:
                self.host = target_host = self.url = target_path = ''

        ## Url may be relative to the present directory
        is_relative = not target_path.startswith("/")
        if is_relative:
            current_dir = os.path.dirname(self.url)
            if not current_dir.endswith("/"):
                current_dir = current_dir + "/"
            target_path = current_dir + target_path

        return target_host, target_path
Пример #3
0
def insertIntoDB(links):
	"""Add every link that is not yet in the ``data`` table.

	Each link is parsed with ``urlnorm.parse`` and rebuilt with
	``urlparse.urlunsplit`` using an empty fragment. The rebuilt URL is
	looked up by exact match; if no row is found it is inserted together
	with the current ``time.time()`` value and a random crawler id drawn
	from ``random.randrange(0, 100)``.

	All queries go through the module-level ``cursor2``.

	Args:
		links: iterable of URL strings. An empty or None value is a no-op.

	Links for which ``urlnorm.parse`` raises UnicodeEncodeError are
	skipped silently.
	"""

	global cursor2

	if links:
		for link in links:

			try:
				portions = urlnorm.parse(link)

			# We should either try to handle this and/or have a db for bad links
			except UnicodeEncodeError:
				continue

			# We throw away fragments: the last slot is left empty on purpose
			link = urlparse.urlunsplit((portions[0], portions[1], portions[2], portions[3], ""))

			# Query parameters must be a sequence; (link,) is a 1-tuple,
			# whereas (link) would be the bare string itself.
			cursor2.execute("SELECT url FROM data WHERE url=%s", (link,))
			row = cursor2.fetchone()
			# Drawn for every link, including ones that are already stored
			crawler_id = random.randrange(0, 100)

			if row is None:
				cursor2.execute(
					"INSERT INTO data (url, timestamp_added, crawler_id) VALUES (%s, %s, %s)",
					(link, time.time(), crawler_id))