def _fetchLink(self, link, parentDir):
    """Crawl outward from ``link``, saving non-HTML responses under ``parentDir``.

    Every URL is opened with ``urllib.urlopen``.  A response whose content
    type is ``text/html`` is only fed to a ``miniHTMLParser`` (it is not
    written to disk); any other response is written to
    ``parentDir/<host>[/<port>]/<url path components>``.  The parser's
    ``getNextLink()`` supplies the next URL and the crawl ends when it
    returns a false value.

    Args:
        link: URL to start from.  A false value skips the crawl entirely.
        parentDir: directory under which downloaded files are stored.

    Raises:
        Errors from opening a URL or an output file propagate to the
        caller; failures while creating directories are only logged.
    """
    parser = miniHTMLParser()
    self.log.debug("Checking link %s" % link)
    while link:
        # Get the file from the site and link
        response = urllib.urlopen(link)
        out = None
        try:
            isHtml = response.info().gettype() == 'text/html'
            if isHtml:
                parser.setBaseUrl(response.geturl())
            else:
                parsed = urlparse.urlparse(link)
                host, port = parsed[1], None
                if ':' in host:
                    host, port = host.split(':', 1)

                # os.path.join() cannot take None, so the port only becomes
                # a path component when the URL actually carries one.
                parts = [parentDir, host]
                if port:
                    parts.append(port)
                parts.extend([c for c in parsed[2].split('/') if c != ''])
                target = os.path.join(*parts)

                try:
                    self.log.debug('Creating %s' % target)
                    targetDir = os.path.dirname(target)
                    if not os.path.exists(targetDir):
                        os.makedirs(targetDir)
                except Exception:
                    # Best effort: if the directory is really missing the
                    # open() below reports the failure.
                    self.log.debug(get_exception_string())

                # Non-HTML payloads may be binary, so write them untranslated.
                out = open(target, 'wb')

            bufSz = 8192
            buf = response.read(bufSz)
            while buf:
                if isHtml:
                    # Feed the file into the HTML parser
                    parser.feed(buf)
                if out:
                    out.write(buf)
                buf = response.read(bufSz)
        finally:
            # Release both handles even when the download fails part-way.
            response.close()
            if out:
                out.close()

        # Get the next link in level traversal order
        link = parser.getNextLink()

    parser.close()
def _fetchLink(self, link, parentDir):
    """Crawl outward from ``link``, saving non-HTML responses under ``parentDir``.

    Every URL is opened with ``urllib.urlopen``.  A response whose content
    type is ``text/html`` is only fed to a ``miniHTMLParser`` (it is not
    written to disk); any other response is written to
    ``parentDir/<host>[/<port>]/<url path components>``.  The parser's
    ``getNextLink()`` supplies the next URL and the crawl ends when it
    returns a false value.

    Args:
        link: URL to start from.  A false value skips the crawl entirely.
        parentDir: directory under which downloaded files are stored.

    Raises:
        Errors from opening a URL or an output file propagate to the
        caller; failures while creating directories are only logged.
    """
    parser = miniHTMLParser()
    self.log.debug("Checking link %s" % link)
    while link:
        # Get the file from the site and link
        response = urllib.urlopen(link)
        out = None
        try:
            isHtml = response.info().gettype() == "text/html"
            if isHtml:
                parser.setBaseUrl(response.geturl())
            else:
                parsed = urlparse.urlparse(link)
                host, port = parsed[1], None
                if ":" in host:
                    host, port = host.split(":", 1)

                # os.path.join() cannot take None, so the port only becomes
                # a path component when the URL actually carries one.
                parts = [parentDir, host]
                if port:
                    parts.append(port)
                parts.extend([c for c in parsed[2].split("/") if c != ""])
                target = os.path.join(*parts)

                try:
                    self.log.debug("Creating %s" % target)
                    targetDir = os.path.dirname(target)
                    if not os.path.exists(targetDir):
                        os.makedirs(targetDir)
                except Exception:
                    # Best effort: if the directory is really missing the
                    # open() below reports the failure.
                    self.log.debug(get_exception_string())

                # Non-HTML payloads may be binary, so write them untranslated.
                out = open(target, "wb")

            bufSz = 8192
            buf = response.read(bufSz)
            while buf:
                if isHtml:
                    # Feed the file into the HTML parser
                    parser.feed(buf)
                if out:
                    out.write(buf)
                buf = response.read(bufSz)
        finally:
            # Release both handles even when the download fails part-way.
            response.close()
            if out:
                out.close()

        # Get the next link in level traversal order
        link = parser.getNextLink()

    parser.close()
def __collect_jobtracker_ui(self, dir):
    """Save a browsable copy of the JobTracker web UI pages under ``dir``.

    The crawl is seeded with ``self.mapredInfo + "/jobtracker.jsp"`` and
    follows the links returned by a ``miniHTMLParser``.  Only links that
    mention jobfailures.jsp, jobtracker.jsp, jobdetails.jsp or jobtasks.jsp
    are downloaded.  Each download is attempted up to four times, one
    second apart, and reading a page is bounded by a 10 second SIGALRM
    timeout.  The hrefs inside each page are rewritten to match the
    flattened ``*.html`` file names used on disk.

    Args:
        dir: directory under which the pages are written.

    Raises:
        HodInterruptException: if ``hodInterrupt`` is set before a download
            attempt or when a read times out.

    Side effects:
        Installs a SIGALRM handler, which stays installed on return.
    """
    link = self.mapredInfo + "/jobtracker.jsp"
    parser = miniHTMLParser()
    parser.setBaseUrl(self.mapredInfo)

    self.__log.debug("collect_jobtracker_ui seeded with " + link)

    def alarm_handler(number, stack):
        raise AlarmException("timeout")

    signal.signal(signal.SIGALRM, alarm_handler)

    # taskstats.jsp,taskdetails.jsp not included since too many to collect
    wantedPage = re.compile(
        r"jobfailures\.jsp|jobtracker\.jsp|jobdetails\.jsp|jobtasks\.jsp")

    # Patterns used to re-write the hrefs; compiled once, not per buffer.
    firstParam = re.compile(r"\?(.+?)=(.+?)")
    laterParam = re.compile(r"&(.+?)=(.+?)")
    hostPort = re.compile(r"http://(.+?):(\d+)?")
    hrefAttr = re.compile('href="(.+?)"')

    bufSz = 8192

    while link:
        self.__log.debug("link: %s" % link)
        if wantedPage.search(link):
            # Start every link without a handle: otherwise a link whose
            # retries all fail would re-read the previous, closed page.
            page = None
            for attempt in range(1, 5):
                if hodInterrupt.isSet():
                    raise HodInterruptException()
                try:
                    page = urllib.urlopen(link)
                    break
                except Exception:
                    self.__log.debug(get_exception_string())
                    time.sleep(1)

            if page:
                self.__log.debug("collecting " + link + "...")
                # Literal replace: mapredInfo is a URL prefix, not a regex.
                filename = link.replace(self.mapredInfo, "")
                filename = dir + "/" + filename
                filename = re.sub("http://", "", filename)
                filename = re.sub(r"[\?\&=:]", "_", filename)
                filename = filename + ".html"

                try:
                    tempdir = os.path.dirname(filename)
                    if not os.path.exists(tempdir):
                        os.makedirs(tempdir)
                except Exception:
                    # Best effort: open() below fails if the directory is
                    # really missing.
                    self.__log.debug(get_exception_string())

                out = None
                try:
                    out = open(filename, 'w')
                    signal.alarm(10)
                    try:
                        self.__log.debug("Starting to grab: %s" % link)
                        buf = page.read(bufSz)
                        while buf:
                            # Feed the file into the HTML parser
                            parser.feed(buf)

                            # Re-write the hrefs in the file
                            buf = firstParam.sub(r"_\1_\2", buf)
                            buf = laterParam.sub(r"_\1_\2", buf)
                            buf = hostPort.sub(r"\1_\2/", buf)
                            buf = buf.replace('href="/', 'href="')
                            buf = hrefAttr.sub(r"href=\1.html", buf)

                            out.write(buf)
                            buf = page.read(bufSz)
                        signal.alarm(0)
                    except AlarmException:
                        if hodInterrupt.isSet():
                            raise HodInterruptException()
                        self.__log.debug("Failed to retrieve: %s" % link)
                    else:
                        self.__log.debug("Finished grabbing: %s" % link)
                finally:
                    # Cancel any pending alarm first so it cannot fire in
                    # unrelated code, then release both handles on every
                    # exit path (success, timeout, interrupt or error).
                    signal.alarm(0)
                    if out:
                        out.close()
                    page.close()
            else:
                self.__log.debug("Failed to retrieve: %s" % link)

        # Get the next link in level traversal order
        link = parser.getNextLink()

    parser.close()