Exemplo n.º 1
0
def parsework(docid, path, textdir, workdir, q):
    """Parse one document, derive its sorted word and toms files, and report back.

    Steps, in order:
      1. copy the source file into ``textdir``;
      2. run ``AbstractParser`` over it, writing the raw output to
         ``<workdir><filename>.raw``;
      3. extract and sort the "word " records of the raw output into
         ``<workdir><filename>.words.sorted``;
      4. build ``<workdir><filename>.toms`` with ``Toms.mktoms`` and sort it
         into ``<workdir><filename>.toms.sorted``;
      5. put the parser's return value on ``q`` and exit the process.

    Args:
        docid: Document id; formatted with ``%d`` and passed to the parser.
        path: Path of the file to parse.
        textdir: Directory prefix for the copied source.  It is concatenated
            with the file name as-is, so it must end with a path separator.
        workdir: Directory prefix for every generated file; same
            trailing-separator requirement as ``textdir``.
        q: Object with a ``put`` method that receives the parse result.

    This function does not return normally: it ends with ``sys.exit()``,
    which raises ``SystemExit``.

    NOTE(review): paths are interpolated unquoted into shell commands, so
    file names containing spaces or shell metacharacters will break them.
    """
    sortkeys = "-k 1,1 -k 2,2n -k 3,3n -k 4,4n -k 5,5n -k 6,6n -k 7,7n -k 8,8n"

    filename = os.path.basename(path)
    origpath = os.path.abspath(path)
    newpath = textdir + filename
    outpath = workdir + filename + ".raw"
    wordspath = workdir + filename + ".words.sorted"
    tomsfile = workdir + filename + ".toms"
    sortedtomsfile = workdir + filename + ".toms.sorted"

    with open(path) as f:
        os.system("cp %s %s" % (origpath, newpath))
        # The raw output must be closed (and therefore flushed) before the
        # shell pipeline and Toms.mktoms below read it back from disk.
        with codecs.open(outpath, "w", "utf-8") as o:
            print("parsing %d : %s" % (docid, filename))
            parser = AbstractParser.AbstractParser(filename, docid)
            r = parser.parse(f, o)

    # Keep only the "word " records, drop their first field, and sort the rest.
    wordcommand = 'cat %s | egrep "^word " | cut -d " " -f 2,3,4,5,6,7,8,9,10,11 | sort %s > %s' % (
        outpath, sortkeys, wordspath)
    os.system(wordcommand)

    # Close the .toms file explicitly so the shell sort sees complete data.
    with open(outpath) as rawfile, open(tomsfile, "w") as tomsout:
        Toms.mktoms(rawfile, tomsout)
    os.system("cat %s | sort -k 1,1n -k 2,2n -k 3,3n -k 4,4n -k 5,5n > %s" %
              (tomsfile, sortedtomsfile))

    q.put(r)
    sys.exit()
Exemplo n.º 2
0
    parser = TEIParser.TEIParser(filename, docid)
    parser.parse(f, o)
    fileinfo.append({"path": path, "name": filename, "raw": outpath})

# Post-processing of the parsed files: sort each file's word records, build
# and sort its metadata ("toms") records, then merge everything into two
# corpus-wide files under workdir.
print("parsed %d files successfully.\nsorting..." % len(fileinfo))
# NOTE(review): `file` shadows the Python 2 builtin; the name is kept because
# it is module-level and code outside this section may still read it.
for file in fileinfo:
    print("sorting %s" % file["name"])
    file["words"] = file["path"] + ".words.sorted"
    # Keep only the "word " records of the raw parse, drop their first field,
    # and sort what is left with the shared sort keys.
    wordcommand = "cat %s | egrep \"^word \" | cut -d \" \" -f 2,3,4,5,6,7,8,9,10,11 | sort %s > %s" % (
        file["raw"], sortkeys, file["words"])
    os.system(wordcommand)

for file in fileinfo:
    print("building metadata for %s" % file["name"])
    file["toms"] = file["path"] + ".toms"
    # Close both handles before the shell sort below reads the .toms file, so
    # its contents are on disk regardless of garbage-collection timing.
    with open(file["raw"], "r") as rawfile, open(file["toms"], "w") as tomsout:
        Toms.mktoms(rawfile, tomsout)
    file["sortedtoms"] = file["path"] + ".toms.sorted"
    tomscommand = "cat %s | sort -k 1,1n -k 2,2n -k 3,3n -k 4,4n -k 5,5n > %s" % (
        file["toms"], file["sortedtoms"])
    os.system(tomscommand)

print("done sorting individual files.\nmerging...")
wordfilearg = " ".join(file["words"] for file in fileinfo)

# `sort -m` merges inputs that are already sorted, without re-sorting them.
os.system("sort -m %s %s > %s" %
          (sortkeys, wordfilearg, workdir + "/all.words.sorted"))
# NOTE(review): the per-file toms sort uses five keys but this merge uses
# four -- confirm that the difference is intentional.
os.system("sort -m -k 1,1n -k 2,2n -k 3,3n -k 4,4n %s > %s" %
          (" ".join(file["sortedtoms"]
                    for file in fileinfo), workdir + "/all.toms.sorted"))
print("done merging.\nnow analyzing for compression....")
Exemplo n.º 3
0
# Stage 1: for every parsed file, extract its "word " records from the raw
# parse output (dropping the first field) and sort them next to the source.
print("parsed %d files successfully.\nsorting..." % len(fileinfo))
# NOTE(review): `file` shadows the Python 2 builtin; the name is kept because
# it is module-level and code outside this section may still read it.
for file in fileinfo:
    print("sorting %s" % file["name"])
    file["words"] = file["path"] + ".words.sorted"
    wordcommand = 'cat %s | egrep "^word " | cut -d " " -f 2,3,4,5,6,7,8,9,10,11 | sort %s > %s' % (
        file["raw"],
        sortkeys,
        file["words"],
    )
    os.system(wordcommand)

# Stage 2: build each file's metadata ("toms") file and sort it numerically.
for file in fileinfo:
    print("building metadata for %s" % file["name"])
    file["toms"] = file["path"] + ".toms"
    # Use context managers so the .toms file is closed and flushed before the
    # external sort reads it, instead of relying on garbage collection.
    with open(file["raw"], "r") as rawfile, open(file["toms"], "w") as tomsout:
        Toms.mktoms(rawfile, tomsout)
    file["sortedtoms"] = file["path"] + ".toms.sorted"
    tomscommand = "cat %s | sort -k 1,1n -k 2,2n -k 3,3n -k 4,4n -k 5,5n > %s" % (file["toms"], file["sortedtoms"])
    os.system(tomscommand)

# Stage 3: merge the already-sorted per-file outputs into corpus-wide files.
print("done sorting individual files.\nmerging...")
wordfilearg = " ".join(file["words"] for file in fileinfo)

os.system("sort -m %s %s > %s" % (sortkeys, wordfilearg, workdir + "/all.words.sorted"))
# NOTE(review): the per-file toms sort uses five keys but this merge uses
# four -- confirm that the difference is intentional.
os.system(
    "sort -m -k 1,1n -k 2,2n -k 3,3n -k 4,4n %s > %s"
    % (" ".join(file["sortedtoms"] for file in fileinfo), workdir + "/all.toms.sorted")
)
print("done merging.\nnow analyzing for compression....")

# Presumably read by the analysis code that follows this section; the handle
# is left open here because its use is outside the visible source.
words = open(workdir + "/all.words.sorted")