def __init__(self, urlstr, debug=0): """ get HTML contents at a given url 'urlstr' """ self.debug = debug self.geturl = geturl.geturl(urlstr) if debug: print '### DATA size:', len(self.geturl.data) self.parser = hparser.hparser(self.geturl.baseurl, debug=1) self.parser.feed(self.geturl.data) self.parser.close() if debug: print '### Got :', len(self.parser.data) print self.parser.data self.parser.analyze() print '#' * 50, '\n'
def __init__(self, urlstr, debug=0): """ get HTML contents at a given url 'urlstr' """ self.debug = debug self.geturl = geturl.geturl(urlstr) if debug: print '### DATA size:', len(self.geturl.data) self.parser = hparser.hparser(self.geturl.baseurl, debug=1) self.parser.feed( self.geturl.data ) self.parser.close() if debug: print '### Got :', len(self.parser.data) print self.parser.data self.parser.analyze() print '#'*50,'\n'
# Start of the crawl: fetch the seed URL, parse it, and record every anchor
# found on it in url_table at depth level 1.
# NOTE(review): this excerpt ends inside the `try:` block -- the matching
# except/finally clause is not visible here. Indentation is reconstructed;
# everything after `try:` is assumed to be inside it.
theurl = 'http://m.knpu.org/hoho'
txdata = None  # txdata is the request BODY handed to urllib2.Request (None here)
txheaders = client.header.header_table  # txheaders is the request header table
#assert None
try:
    # First URL -> search it for CHILD URLs
    req = urllib2.Request(theurl, txdata, txheaders)
    handle = urllib2.urlopen(req)
    data = handle.read()
    print data
    # GET URL: parse the body that was just read
    parser = hparser.hparser(theurl, debug=1)
    parser.feed(data)
    parser.close()
    #print parser.prn_anchors()
    lists = parser.get_anchors()
    # Make the dictionary URL : DepthLevel
    visited_list = []
    url_table = {}
    depth = 2
    seq_num = 1
    prefix = 'knpu.org'
    # NOTE(review): the assignment below immediately replaces 'knpu.org'.
    prefix = 'prefixtest.com'
    for homepage in lists:
        # homepage[0] is used as the table key; presumably the anchor's URL.
        url_table[ homepage[0] ] = 1  # First Depth Level
for i in req.unredirected_hdrs: length = length + len(i) + len(req.unredirected_hdrs[i]) # print req.unredirected_hdrs # print req.header_items() # print req.has_header('Cookie') # print "=" * 50, 'End' data = handle_output.read() if handle_output.headers.getheader('Content-Type') != \ 'text/html': # Content-Type 이 아니라면 Skip continue #print data try: parser = hparser.hparser(handle_output.geturl(), debug=1) parser.feed(data) parser.close() lists = parser.get_anchors() # lists 에 body의 URL 저장 except: pass for homepage in lists: if urlparse.urlparse(homepage[0]) [0] != 'http': print urlparse.urlparse(homepage[0]) [0] continue url_table[ homepage[0] ] = current_depth + 1 #print 'DEBUG 4444'