Python miniHTMLParser 예제들, hodlib.Common.miniHTMLParser.miniHTMLParser Python 예제들

예제 #1

0

파일 보기

    def _fetchLink(self, link, parentDir):
        """Crawl starting at ``link`` and mirror non-HTML resources locally.

        Every URL is opened with ``urllib.urlopen``.  A response whose content
        type is ``text/html`` is fed to a ``miniHTMLParser``, which then
        supplies the next link to visit.  Any other response is written to
        ``parentDir/<host>[/<port>]/<url path components>``.  The loop stops
        once the parser returns no further link.

        Args:
            link: URL to start from.
            parentDir: Directory under which downloaded files are stored.
        """
        parser = miniHTMLParser()
        self.log.debug("Checking link %s" % link)
        bufSz = 8192
        while link:

            # Get the file from the site and link
            response = urllib.urlopen(link)
            out = None
            try:
                isHtml = response.info().gettype() == 'text/html'

                if isHtml:
                    parser.setBaseUrl(response.geturl())
                else:
                    parsed = urlparse.urlparse(link)
                    host = parsed[1]
                    port = None
                    if ':' in host:
                        host, port = host.split(':', 1)

                    # The port is optional.  os.path.join() cannot take None,
                    # so only add a port component when the URL carries one.
                    target = os.path.join(parentDir, host)
                    if port:
                        target = os.path.join(target, port)

                    # NOTE(review): path components come from the remote URL
                    # and are not checked for '..'; confirm that callers only
                    # crawl trusted sites.
                    for component in parsed[2].split('/'):
                        if component:
                            target = os.path.join(target, component)

                    # Directory creation is best effort: a failure is logged
                    # and the open() below reports the real problem.
                    try:
                        self.log.debug('Creating %s' % target)
                        targetDir = os.path.dirname(target)
                        if not os.path.exists(targetDir):
                            os.makedirs(targetDir)
                    except Exception:
                        self.log.debug(get_exception_string())

                    # Non-HTML payloads may be binary, so write bytes as-is.
                    out = open(target, 'wb')

                buf = response.read(bufSz)
                while buf:
                    if isHtml:
                        # Feed the file into the HTML parser
                        parser.feed(buf)
                    if out is not None:
                        out.write(buf)
                    buf = response.read(bufSz)
            finally:
                # Release both handles even when the transfer fails midway.
                if out is not None:
                    out.close()
                response.close()

            # Get the next link in level traversal order
            link = parser.getNextLink()

        parser.close()

예제 #2

0

파일 보기

파일: ringMaster.py 프로젝트: JichengSong/hadoop-20

    def _fetchLink(self, link, parentDir):
        """Crawl starting at ``link`` and mirror non-HTML resources locally.

        Every URL is opened with ``urllib.urlopen``.  A response whose content
        type is ``text/html`` is fed to a ``miniHTMLParser``, which then
        supplies the next link to visit.  Any other response is written to
        ``parentDir/<host>[/<port>]/<url path components>``.  The loop stops
        once the parser returns no further link.

        Args:
            link: URL to start from.
            parentDir: Directory under which downloaded files are stored.
        """
        parser = miniHTMLParser()
        self.log.debug("Checking link %s" % link)
        bufSz = 8192
        while link:

            # Get the file from the site and link
            response = urllib.urlopen(link)
            out = None
            try:
                isHtml = response.info().gettype() == "text/html"

                if isHtml:
                    parser.setBaseUrl(response.geturl())
                else:
                    parsed = urlparse.urlparse(link)
                    host = parsed[1]
                    port = None
                    if ":" in host:
                        host, port = host.split(":", 1)

                    # The port is optional.  os.path.join() cannot take None,
                    # so only add a port component when the URL carries one.
                    target = os.path.join(parentDir, host)
                    if port:
                        target = os.path.join(target, port)

                    # NOTE(review): path components come from the remote URL
                    # and are not checked for '..'; confirm that callers only
                    # crawl trusted sites.
                    for component in parsed[2].split("/"):
                        if component:
                            target = os.path.join(target, component)

                    # Directory creation is best effort: a failure is logged
                    # and the open() below reports the real problem.
                    try:
                        self.log.debug("Creating %s" % target)
                        targetDir = os.path.dirname(target)
                        if not os.path.exists(targetDir):
                            os.makedirs(targetDir)
                    except Exception:
                        self.log.debug(get_exception_string())

                    # Non-HTML payloads may be binary, so write bytes as-is.
                    out = open(target, "wb")

                buf = response.read(bufSz)
                while buf:
                    if isHtml:
                        # Feed the file into the HTML parser
                        parser.feed(buf)
                    if out is not None:
                        out.write(buf)
                    buf = response.read(bufSz)
            finally:
                # Release both handles even when the transfer fails midway.
                if out is not None:
                    out.close()
                response.close()

            # Get the next link in level traversal order
            link = parser.getNextLink()

        parser.close()

예제 #3

0

파일 보기

파일: hadoop.py 프로젝트: 3rdandUrban-dev/hadoop-20

  def __collect_jobtracker_ui(self, dir):
    """Save the JobTracker web UI pages under ``dir`` as static HTML files.

    Starts at ``<self.mapredInfo>/jobtracker.jsp`` and follows the links
    reported by a ``miniHTMLParser``.  Only jobfailures/jobtracker/
    jobdetails/jobtasks pages are downloaded.  Each page is written to a file
    whose name is derived from the link, with hrefs rewritten to match those
    file names.  Opening a link is tried up to four times, and reading a page
    is limited to 10 seconds via SIGALRM.

    Args:
        dir: Directory under which the pages are stored.

    Raises:
        HodInterruptException: if ``hodInterrupt`` is set while collecting.
    """
    link = self.mapredInfo + "/jobtracker.jsp"
    parser = miniHTMLParser()
    parser.setBaseUrl(self.mapredInfo)

    self.__log.debug("collect_jobtracker_ui seeded with " + link)

    def alarm_handler(number, stack):
      raise AlarmException("timeout")

    signal.signal(signal.SIGALRM, alarm_handler)

    # Patterns are loop invariant, so compile them once up front.
    # taskstats.jsp,taskdetails.jsp not included since too many to collect
    wantedPage = re.compile(
      r"jobfailures\.jsp|jobtracker\.jsp|jobdetails\.jsp|jobtasks\.jsp")
    unsafeChars = re.compile(r"[\?\&=:]")
    queryParam = re.compile(r"\?(.+?)=(.+?)")
    ampParam = re.compile(r"&(.+?)=(.+?)")
    # (\d*) rather than (\d+)?: the group then always participates, so the
    # \2 back-reference cannot fail with "unmatched group" when no port
    # digits follow the colon.
    hostPort = re.compile(r"http://(.+?):(\d*)")
    hrefTarget = re.compile(r'href="(.+?)"')
    bufSz = 8192

    while link:
      self.__log.debug("link: %s" % link)
      if wantedPage.search(link):

        # Start from a clean slate for every link; otherwise a failed open
        # would fall through to the previous link's closed handle.
        response = None
        for _attempt in range(1, 5):
          if hodInterrupt.isSet():
            raise HodInterruptException()
          try:
            response = urllib.urlopen(link)
            break
          except Exception:
            self.__log.debug(get_exception_string())
            time.sleep(1)

        if response:
          self.__log.debug("collecting " + link + "...")
          # mapredInfo is a literal prefix, not a regular expression.
          filename = link.replace(self.mapredInfo, "")
          filename = dir + "/" + filename
          filename = filename.replace("http://", "")
          filename = unsafeChars.sub("_", filename)
          filename = filename + ".html"

          # Directory creation is best effort; open() below reports failure.
          try:
            tempdir = os.path.dirname(filename)
            if not os.path.exists(tempdir):
              os.makedirs(tempdir)
          except Exception:
            self.__log.debug(get_exception_string())

          out = None
          try:
            out = open(filename, 'w')

            signal.alarm(10)

            self.__log.debug("Starting to grab: %s" % link)
            buf = response.read(bufSz)

            while len(buf) > 0:
              # Feed the file into the HTML parser
              parser.feed(buf)

              # Re-write the hrefs in the file
              # NOTE(review): the rewrite is applied per chunk, so a pattern
              # that straddles a chunk boundary is presumably left
              # untouched - confirm whether that matters.
              buf = queryParam.sub(r"_\1_\2", buf)
              buf = ampParam.sub(r"_\1_\2", buf)
              buf = hostPort.sub(r"\1_\2/", buf)
              buf = buf.replace('href="/', 'href="')
              buf = hrefTarget.sub(r"href=\1.html", buf)

              out.write(buf)
              buf = response.read(bufSz)

            self.__log.debug("Finished grabbing: %s" % link)
          except AlarmException:
            if hodInterrupt.isSet():
              raise HodInterruptException()

            self.__log.debug("Failed to retrieve: %s" % link)
          finally:
            # Always cancel the pending alarm and release both handles,
            # whichever way the transfer ended.
            signal.alarm(0)
            if out:
              out.close()
            response.close()
        else:
          self.__log.debug("Failed to retrieve: %s" % link)

      # Get the next link in level traversal order
      link = parser.getNextLink()

    parser.close()

예제 #4

0

파일 보기

파일: hadoop.py 프로젝트: NikkitaSh30/i-mapreduce

  def __collect_jobtracker_ui(self, dir):
    """Save the JobTracker web UI pages under ``dir`` as static HTML files.

    Starts at ``<self.mapredInfo>/jobtracker.jsp`` and follows the links
    reported by a ``miniHTMLParser``.  Only jobfailures/jobtracker/
    jobdetails/jobtasks pages are downloaded.  Each page is written to a file
    whose name is derived from the link, with hrefs rewritten to match those
    file names.  Opening a link is tried up to four times, and reading a page
    is limited to 10 seconds via SIGALRM.

    Args:
        dir: Directory under which the pages are stored.

    Raises:
        HodInterruptException: if ``hodInterrupt`` is set while collecting.
    """
    link = self.mapredInfo + "/jobtracker.jsp"
    parser = miniHTMLParser()
    parser.setBaseUrl(self.mapredInfo)

    self.__log.debug("collect_jobtracker_ui seeded with " + link)

    def alarm_handler(number, stack):
      raise AlarmException("timeout")

    signal.signal(signal.SIGALRM, alarm_handler)

    # Patterns are loop invariant, so compile them once up front.
    # taskstats.jsp,taskdetails.jsp not included since too many to collect
    wantedPage = re.compile(
      r"jobfailures\.jsp|jobtracker\.jsp|jobdetails\.jsp|jobtasks\.jsp")
    unsafeChars = re.compile(r"[\?\&=:]")
    queryParam = re.compile(r"\?(.+?)=(.+?)")
    ampParam = re.compile(r"&(.+?)=(.+?)")
    # (\d*) rather than (\d+)?: the group then always participates, so the
    # \2 back-reference cannot fail with "unmatched group" when no port
    # digits follow the colon.
    hostPort = re.compile(r"http://(.+?):(\d*)")
    hrefTarget = re.compile(r'href="(.+?)"')
    bufSz = 8192

    while link:
      self.__log.debug("link: %s" % link)
      if wantedPage.search(link):

        # Start from a clean slate for every link; otherwise a failed open
        # would fall through to the previous link's closed handle.
        response = None
        for _attempt in range(1, 5):
          if hodInterrupt.isSet():
            raise HodInterruptException()
          try:
            response = urllib.urlopen(link)
            break
          except Exception:
            self.__log.debug(get_exception_string())
            time.sleep(1)

        if response:
          self.__log.debug("collecting " + link + "...")
          # mapredInfo is a literal prefix, not a regular expression.
          filename = link.replace(self.mapredInfo, "")
          filename = dir + "/" + filename
          filename = filename.replace("http://", "")
          filename = unsafeChars.sub("_", filename)
          filename = filename + ".html"

          # Directory creation is best effort; open() below reports failure.
          try:
            tempdir = os.path.dirname(filename)
            if not os.path.exists(tempdir):
              os.makedirs(tempdir)
          except Exception:
            self.__log.debug(get_exception_string())

          out = None
          try:
            out = open(filename, 'w')

            signal.alarm(10)

            self.__log.debug("Starting to grab: %s" % link)
            buf = response.read(bufSz)

            while len(buf) > 0:
              # Feed the file into the HTML parser
              parser.feed(buf)

              # Re-write the hrefs in the file
              # NOTE(review): the rewrite is applied per chunk, so a pattern
              # that straddles a chunk boundary is presumably left
              # untouched - confirm whether that matters.
              buf = queryParam.sub(r"_\1_\2", buf)
              buf = ampParam.sub(r"_\1_\2", buf)
              buf = hostPort.sub(r"\1_\2/", buf)
              buf = buf.replace('href="/', 'href="')
              buf = hrefTarget.sub(r"href=\1.html", buf)

              out.write(buf)
              buf = response.read(bufSz)

            self.__log.debug("Finished grabbing: %s" % link)
          except AlarmException:
            if hodInterrupt.isSet():
              raise HodInterruptException()

            self.__log.debug("Failed to retrieve: %s" % link)
          finally:
            # Always cancel the pending alarm and release both handles,
            # whichever way the transfer ended.
            signal.alarm(0)
            if out:
              out.close()
            response.close()
        else:
          self.__log.debug("Failed to retrieve: %s" % link)

      # Get the next link in level traversal order
      link = parser.getNextLink()

    parser.close()