# NOTE(review): the statements on the next line have lost their line breaks and
# indentation. They are described here in the order in which they appear.
#
# 1. Tail of a generator (it contains `yield`); its `def` header, and presumably
#    a loop that supplies `f`, lie above this line -- TODO confirm.
#    It joins `dir` and `f` into a path. For a regular file it reads the raw
#    bytes and searches for the first `<title>...</title>`. It yields the path
#    when a title is found and that title contains no ':'.
#    (Presumably the ':' test is nested under `if m:`; the nesting cannot be
#    recovered from this line -- verify.)
#    NOTE(review): the handle from `open(f,'rb')` is never explicitly closed.
#
# 2. Script entry point. With fewer than one command-line argument it prints
#    two usage lines and exits. Otherwise it creates a SAX parser and installs
#    `subTree("text", process)` as its content handler. If `Input` is a regular
#    file, that file is parsed. Otherwise `Input` is made absolute and every
#    path produced by `articles(Input)` is printed and parsed. The path being
#    parsed is stored in `currentFile` before each parse.
#    NOTE(review): `Input` is not assigned anywhere in the visible code;
#    presumably it should be `sys.argv[1]` -- verify against the full file.
#    NOTE(review): "Direcotory" in the second usage string is a typo for
#    "Directory".
f = os.path.join(dir,f) if os.path.isfile(f): txt = open(f,'rb').read() m = re.search('<title>(.*?)</title>',txt) if m: title = m.group(1) if ':' not in title: yield f if __name__=="__main__": if len(sys.argv) < 2: print "Usage: $formatter.py File" print "Usage: $formatter.py Direcotory" sys.exit() p = xml.sax.make_parser() contentHandler = subTree("text", process) p.setContentHandler(contentHandler) if os.path.isfile(Input): currentFile = Input fh = open(Input,'r') p.parse(fh) fh.close() else: Input = os.path.abspath(Input) for f in articles(Input): print f currentFile = f fh = open(f,'r') p.parse(fh) fh.close()
def generateNewXML(text):
    """Write one XML fragment to its own file below ``outputPath``.

    An XML declaration is prepended to ``text`` and the result is encoded
    as UTF-8.  The file is named after the content of the first
    ``<id>...</id>`` element found in the document; the chosen path is
    printed before the file is written.  A fragment without an ``<id>``
    element is skipped silently.

    Args:
        text: The fragment as text.  Presumably the serialised ``<page>``
            subtree handed over by ``subTree`` -- confirm against that class.

    Returns:
        None.

    Note:
        Reads the module-level name ``outputPath``, which the script entry
        point sets from the command line.
    """
    document = '<?xml version="1.0" encoding="utf-8"?>' + text
    payload = document.encode("utf-8")

    # Search the encoded form with a bytes pattern so that the pattern and the
    # subject have the same type on both Python 2 and Python 3.
    m = re.search(b'<id>(.*?)</id>', payload)
    if not m:
        return

    ident = m.group(1)
    if not isinstance(ident, str):
        # Python 3 only: the match is bytes, but path pieces must be text.
        ident = ident.decode("utf-8")

    fName = outputPath + "/" + ident
    print(fName)
    # The payload is already encoded, so write it verbatim in binary mode and
    # let the context manager close the handle even if the write fails.
    with open(fName, 'wb') as out:
        out.write(payload)


def puts(text):
    """Print ``text`` to standard output."""
    print(text)


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: $fragmenter.py wikipediaDump outputDirectory")
        sys.exit()
    outputPath = sys.argv[2]
    parser = xml.sax.make_parser()
    # Every <page> subtree of the dump is passed to generateNewXML.
    parser.setContentHandler(subTree("page", generateNewXML))
    # Close the dump deterministically, even when parsing raises.
    with open(sys.argv[1], "r") as dump:
        parser.parse(dump)