def parsework(docid, path, textdir, workdir, q):
    """Parse one document and build its sorted word and toms files.

    The source file is copied into ``textdir``, parsed into
    ``<workdir><filename>.raw``, and two derived files are produced from that
    raw output with shell pipelines: ``.words.sorted`` and ``.toms.sorted``
    (the latter from the intermediate ``.toms`` file written by
    ``Toms.mktoms``).  The value returned by the parser is put on ``q``.

    This function never returns normally: it finishes by calling
    ``sys.exit()`` (presumably because it runs as a worker process -- confirm
    against the caller).

    Args:
        docid: Document id; formatted with ``%d`` and handed to the parser.
        path: Path of the source file to parse.
        textdir: Prefix the source file is copied under.  It is concatenated
            directly with the file name, so it must end with a separator.
        workdir: Prefix for all generated files; same concatenation rule.
        q: Object with a ``put`` method that receives the parse result.
    """
    sortkeys = "-k 1,1 -k 2,2n -k 3,3n -k 4,4n -k 5,5n -k 6,6n -k 7,7n -k 8,8n"

    with open(path) as f:
        filename = os.path.basename(path)
        origpath = os.path.abspath(path)
        newpath = textdir + filename
        # NOTE(review): paths are interpolated into shell command lines
        # unquoted here and below; names with spaces or shell metacharacters
        # will break (or alter) the commands.
        os.system("cp %s %s" % (origpath, newpath))

        outpath = workdir + filename + ".raw"
        with codecs.open(outpath, "w", "utf-8") as o:
            # Single-argument parenthesised print behaves the same whether or
            # not print is a statement.
            print("parsing %d : %s" % (docid, filename))
            parser = AbstractParser.AbstractParser(filename, docid)
            r = parser.parse(f, o)
    # Both files are closed at this point, so everything the parser wrote is
    # on disk before the external commands below read `outpath`.

    # Keep only the "word " records, drop the leading tag field, and sort.
    wordcommand = "cat %s | egrep \"^word \" | cut -d \" \" -f 2,3,4,5,6,7,8,9,10,11 | sort %s > %s" % (
        outpath, sortkeys, workdir + filename + ".words.sorted")
    os.system(wordcommand)

    tomsfile = workdir + filename + ".toms"
    with open(outpath) as rawfile:
        with open(tomsfile, "w") as tomsout:
            Toms.mktoms(rawfile, tomsout)
    # The toms file is closed (flushed) before `sort` reads it.
    sortedtomsfile = workdir + filename + ".toms.sorted"
    os.system("cat %s | sort -k 1,1n -k 2,2n -k 3,3n -k 4,4n -k 5,5n > %s" % (tomsfile, sortedtomsfile))

    q.put(r)
    sys.exit()
# NOTE(review): the next three statements look like the tail of a per-file
# loop whose header (and the definitions of filename, docid, f, o, path and
# outpath) lies above this excerpt -- confirm and restore their indentation.
# Parse the current input `f` with the TEI parser, writing to `o`.
parser = TEIParser.TEIParser(filename, docid)
parser.parse(f, o)
# Record the source path, file name and raw parser output for the later passes.
fileinfo.append({"path": path, "name": filename, "raw": outpath})

print "parsed %d files successfully.\nsorting..." % len(fileinfo)

# Pass 1: build a sorted word file next to each source path.
for file in fileinfo:
    print "sorting %s" % file["name"]
    file["words"] = file["path"] + ".words.sorted"
    # Shell pipeline: keep lines starting with "word ", keep space-separated
    # fields 2-11, then sort using the key options held in `sortkeys`.
    wordcommand = "cat %s | egrep \"^word \" | cut -d \" \" -f 2,3,4,5,6,7,8,9,10,11 | sort %s > %s" % (
        file["raw"], sortkeys, file["words"])
    os.system(wordcommand)

# Pass 2: build the toms file for each raw output and sort it numerically
# on its first five fields.
for file in fileinfo:
    print "building metadata for %s" % file["name"]
    file["toms"] = file["path"] + ".toms"
    Toms.mktoms(open(file["raw"], "r"), open(file["toms"], "w"))
    file["sortedtoms"] = file["path"] + ".toms.sorted"
    tomscommand = "cat %s | sort -k 1,1n -k 2,2n -k 3,3n -k 4,4n -k 5,5n > %s" % (
        file["toms"], file["sortedtoms"])
    os.system(tomscommand)

print "done sorting individual files.\nmerging..."

# `sort -m` merges the per-file outputs into single files under workdir.
wordfilearg = " ".join(file["words"] for file in fileinfo)
os.system("sort -m %s %s > %s" % (sortkeys, wordfilearg, workdir + "/all.words.sorted"))
# NOTE(review): this merge compares four keys while the per-file toms sort
# above used five -- confirm the difference is intended.
os.system("sort -m -k 1,1n -k 2,2n -k 3,3n -k 4,4n %s > %s" % (" ".join(file["sortedtoms"] for file in fileinfo), workdir + "/all.toms.sorted"))

print "done merging.\nnow analyzing for compression...."
# Single-argument parenthesised prints are used throughout; they emit exactly
# the same text as the statement form.
print("parsed %d files successfully.\nsorting..." % len(fileinfo))

# First pass: extract and sort the word records of every parsed file.
for file in fileinfo:
    print("sorting %s" % file["name"])
    file["words"] = file["path"] + ".words.sorted"
    # One literal per pipeline stage; adjacent literals are joined at compile
    # time into the same command template.
    wordcommand = (
        "cat %s"
        ' | egrep "^word "'
        ' | cut -d " " -f 2,3,4,5,6,7,8,9,10,11'
        " | sort %s > %s"
    ) % (file["raw"], sortkeys, file["words"])
    os.system(wordcommand)

# Second pass: derive the toms file from each raw output, then sort it
# numerically on its first five fields.
for file in fileinfo:
    print("building metadata for %s" % file["name"])
    file["toms"] = file["path"] + ".toms"
    Toms.mktoms(open(file["raw"]), open(file["toms"], "w"))
    file["sortedtoms"] = file["path"] + ".toms.sorted"
    tomscommand = (
        "cat %s"
        " | sort -k 1,1n -k 2,2n -k 3,3n -k 4,4n -k 5,5n > %s"
    ) % (file["toms"], file["sortedtoms"])
    os.system(tomscommand)

print("done sorting individual files.\nmerging...")

# Merge the already-sorted per-file outputs into two corpus-wide files.
wordfilearg = " ".join(entry["words"] for entry in fileinfo)
os.system(
    "sort -m %s %s > %s"
    % (sortkeys, wordfilearg, workdir + "/all.words.sorted")
)
# NOTE(review): four merge keys here versus five in the per-file toms sort --
# confirm this is intended.
os.system(
    "sort -m -k 1,1n -k 2,2n -k 3,3n -k 4,4n %s > %s"
    % (
        " ".join(entry["sortedtoms"] for entry in fileinfo),
        workdir + "/all.toms.sorted",
    )
)

print("done merging.\nnow analyzing for compression....")
words = open(workdir + "/all.words.sorted")