import re
from java.io import BufferedInputStream, ByteArrayOutputStream
from java.net import URL
from org.w3c.tidy import Tidy

def crawl(site, trm, depth, linksfile):
    # Match only internal article links, e.g. href="/wiki/Jython"
    pattern = re.compile('href="/wiki/(.*?)"')
    f = open(linksfile, 'a+')
    if depth < MAX_DEPTH:
        print 'crawling [%s]...' % trm,
        print >> f, '[%s]' % trm
        # JTidy cleans the fetched HTML into well-formed XML before scraping
        td = Tidy()
        td.setXmlOut(1)
        #td.setInputEncoding("UTF8")
        #td.setOutputEncoding("UTF8")
        u = URL(site + trm)
        input = BufferedInputStream(u.openStream())
        output = ByteArrayOutputStream()
        td.parse(input, output)
        content = output.toString()
        hits = pattern.findall(content)
        for hit in hits:
            # Skip namespaced pages such as Special:, Talk:, File:
            if hit.find(':') == -1:
                print >> f, hit
        print 'done.'
        print >> f, ''
        # Recurse into each article link found on this page
        for hit in hits:
            if hit.find(':') == -1:
                crawl(site, hit, depth + 1, linksfile)
    f.close()
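For reference, here is a minimal way to kick off the crawler. The MAX_DEPTH value, start page, and output filename are illustrative assumptions, not part of the function above; the script must run under Jython, since it depends on java.net and JTidy.

# Illustrative driver -- MAX_DEPTH, the start term, and the output
# file are assumptions chosen for this example.
MAX_DEPTH = 2

if __name__ == '__main__':
    # Start from the English Wikipedia article on Jython, appending
    # discovered /wiki/ links to links.txt.
    crawl('http://en.wikipedia.org/wiki/', 'Jython', 0, 'links.txt')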