def my_open(buf, page, parent=None):
    """List an OLE container with the ``gsf`` tool and add its streams to *page*.

    Every stream reported by ``gsf list`` is extracted with ``gsf cat``,
    added to the tree through ``add_pgiter`` and, when its name matches a
    known stream, handed to the matching format parser.

    Args:
        buf: Raw bytes of the container.  Only used when *parent* is given;
            it is then written to a temporary file, because gsf needs a path.
        page: Page object; this function reads ``page.fname``, ``page.model``
            and ``page.type`` and may set ``page.type``, ``page.wtable`` and
            ``page.wdata``.
        parent: Tree iter the streams are attached to.  ``None`` means the
            container is the file ``page.fname`` itself.

    Returns:
        The type tag of the last recognised stream ("doc", "vsd", "vba", ...;
        for workbooks whatever ``xls.parse`` returns), or "" when nothing was
        recognised or gsf exited with an error.
    """
    # Local import so this block does not depend on the file's import list.
    import tempfile

    dircache = {}
    vbaiter = None
    vbadata = None
    dirflag = 0
    ftype = ""
    tmpfile = None
    try:
        if parent is None:
            gsffilename = page.fname
        else:
            # need to save tmp file to pass to gsf; mkstemp gives a private,
            # unpredictable name instead of "tmp<timestamp>" in the cwd.
            tmpfd, tmpfile = tempfile.mkstemp()
            with os.fdopen(tmpfd, "wb") as tmp:
                tmp.write(buf)
            gsffilename = tmpfile

        gsfout = subprocess.check_output(["gsf", "list", gsffilename])
        print(gsfout)
        print("-----------------")
        # First line and the trailing empty element are not entries.
        for line in gsfout.split("\n")[1:-1]:
            if not line:
                continue
            fields = line.split()

            if line[0] != "f":
                # Non-file entry: only a storage named VBA is of interest.
                if len(fields) > 2 and fields[2] == "VBA":
                    page.type = "vba"
                    ftype = "vba"
                continue

            # gsf sometimes lists date even for files. Or, rather, it
            # seems that it misrepresents empty dirs as (empty) files.
            # I have observed this with 'Objects' in many .pub files.
            if line[5:6] != " ":
                fullname = " ".join(fields[4:])
            else:
                fullname = " ".join(fields[2:])
            if not fullname:
                fullname = " ".join(fields[2:])

            # Split "dir/sub/name" into directory part and stream name.
            cdir, _, fn = fullname.rpartition("/")
            if fn and ord(fn[0]) < 32:
                # Drop a leading control character from the stream name.
                fn = fn[1:]

            if cdir:
                cdir_to_treeiter(page, parent, cdir, dircache)
                pn = dircache["/" + cdir]
            else:
                # Top-level streams of a nested container belong under it,
                # just like its directories do.
                pn = parent

            data = subprocess.check_output(
                ["gsf", "cat", gsffilename, fullname])
            iter1 = add_pgiter(page, fn, "ole", fn, data, pn)

            if fn == "DesignerDoc":
                ftype = "dsf"
                page.model.set_value(iter1, 1, ("dsf", dirflag))
                dsf.open(page, data, iter1)
            if fn in ("EscherStm", "EscherDelayStm"):
                ftype = "escher"
                page.model.set_value(iter1, 1, ("escher", dirflag))
                # currently I don't parse it automagically for MSDOC
                escher.parse(page.model, data, iter1, "pub")
            if fn == "MagicTab":
                ftype = "wls"
                page.model.set_value(iter1, 1, ("wls", dirflag))
                wls.parse(page, data, iter1)
            if fn == "CONTENTS":
                if data[6:11] == "WT602":
                    ftype = "wt602"
                    page.model.set_value(iter1, 1, ("wt602", dirflag))
                    wt602.parse(page, data, iter1)
                elif fullname.split('/')[0] == "OleObjects":
                    # Nested OLE objects (or images) in WT602
                    wt602.parse_object(page, data, iter1)
                else:
                    ftype = "quill"
                    page.model.set_value(iter1, 1, ("quill", dirflag))
                    quill.parse(page, data, iter1)
            if fn == "Contents":
                if data[:2] == "\xe8\xac":  # take signature into account
                    ftype = "pub"
                    page.model.set_value(iter1, 1, ("pub", dirflag))
                    pub.parse(page, data, iter1)
            if fn == "VisioDocument":
                ftype = "vsd"
                page.model.set_value(iter1, 1, ("vsd", dirflag))  # level = 1?
                vsd.parse(page, data, iter1)
            if fn == "PageMaker":
                ftype = "pm"
                page.model.set_value(iter1, 1, ("pm", dirflag))
                pm6.open(page, data, iter1)
            if fn == "WordDocument":
                ftype = "doc"
                page.model.set_value(iter1, 1, ("doc", dirflag))  # level = 1
                doc.parse(page, data, iter1)
            if fn in ("1Table", "0Table"):
                page.wtable = iter1
            if fn == "Data" and page.type == "DOC":
                page.wdata = iter1
            if fn in ("Book", "Workbook"):
                page.model.set_value(iter1, 1, ("xls", dirflag))
                ftype = xls.parse(page, data, iter1)
            if fn in ("PowerPoint Document", "Pictures"):
                ftype = "ppt"
                page.model.set_value(iter1, 1, ("ppt", dirflag))
                ppt.parse(page, data, iter1)
            if fn == "NativeContent_MAIN":
                ftype = "qpw"
                page.model.set_value(iter1, 1, ("qpw", dirflag))
                qpw.parse(page, data, iter1)
            if fn == "Signature" and data[:4] == '\x60\x67\x01\x00':
                ftype = "ppp"  # PagePlus OLE version (9.x?)
            if fn in ("contents", "SCFFPreview") and ftype == "ppp":
                ppp.parse(page, data, iter1, fn)
            # I've no idea if this is really the signature, but it is
            # present in all files I've seen so far
            if fn == "Header" and data[0xc:0xf] == 'xV4':
                ftype = 'zmf'
                zmf.zmf2_open(page, data, iter1, fn)
            if fn[-4:] == '.zmf':
                ftype = 'zmf'
                zmf.zmf2_open(page, data, iter1, fn)
            if fn == "VBA":
                page.type = "vba"
                ftype = "vba"
            if "vba/dir" in fullname.lower():
                # Remember the VBA dir stream; it is parsed after the loop.
                page.model.set_value(iter1, 1, ("vba", dirflag))
                vbaiter = iter1
                vbadata = data
            if "SummaryInformation" in fn:
                page.model.set_value(iter1, 1, ("ole", "propset"))

        if vbaiter is not None:
            vba.parse(page, vbadata, vbaiter)
    except subprocess.CalledProcessError:
        print("Failed to run gsf. Please install libgsf.")
    finally:
        # Only ever delete the temporary copy, never the caller's own file,
        # and do it once all "gsf cat" calls are finished.
        if tmpfile is not None:
            try:
                os.remove(tmpfile)
            except OSError:
                pass  # best-effort cleanup
    return ftype
def my_open(buf, page, parent=None):
    """List an OLE container with the ``gsf`` tool and add its streams to *page*.

    Every stream reported by ``gsf list`` is extracted with ``gsf cat``,
    added to the tree through ``add_pgiter`` and, when its name matches a
    known stream, handed to the matching format parser.

    Args:
        buf: Raw bytes of the container.  Only used when *parent* is given;
            it is then written to a temporary file, because gsf needs a path.
        page: Page object; this function reads ``page.fname``, ``page.model``
            and ``page.type`` and may set ``page.type``, ``page.wtable``,
            ``page.wdata`` and ``page.appcontentdoc``.
        parent: Tree iter the streams are attached to.  ``None`` means the
            container is the file ``page.fname`` itself.

    Returns:
        The type tag of the last recognised stream ("doc", "vsd", "vba", ...;
        for workbooks whatever ``xls.parse`` returns), or "" when nothing was
        recognised.
    """
    dircache = {}
    vbaiter = None
    vbadata = None
    dirflag = 0
    ftype = ""
    tmpfile = None
    try:
        if parent is None:
            gsffilename = page.fname
        else:
            # need to save tmp file to pass to gsf
            tmpfd, tmpfile = mkstemp()
            # A file object writes the whole buffer (raw os.write may write
            # short) and closes the descriptor even if the write fails.
            with os.fdopen(tmpfd, "wb") as tmp:
                tmp.write(buf)
            gsffilename = tmpfile

        gsfout = subprocess.check_output(["gsf", "list", gsffilename])
        print(gsfout)
        print("-----------------")
        # First line and the trailing empty element are not entries.
        for line in gsfout.split("\n")[1:-1]:
            if not line:
                continue
            fields = line.split()

            if line[0] != "f":
                # Non-file entry: only a storage named VBA is of interest.
                if len(fields) > 2 and fields[2] == "VBA":
                    page.type = "vba"
                    ftype = "vba"
                continue

            # gsf sometimes lists date even for files. Or, rather, it
            # seems that it misrepresents empty dirs as (empty) files.
            # I have observed this with 'Objects' in many .pub files.
            if line[5:6] != " ":
                fullname = " ".join(fields[4:])
            else:
                fullname = " ".join(fields[2:])
            if not fullname:
                fullname = " ".join(fields[2:])

            # Split "dir/sub/name" into directory part and stream name.
            cdir, _, fn = fullname.rpartition("/")
            if fn and ord(fn[0]) < 32:
                # Drop a leading control character from the stream name.
                fn = fn[1:]

            if cdir:
                cdir_to_treeiter(page, parent, cdir, dircache)
                pn = dircache["/" + cdir]
            else:
                pn = parent

            data = subprocess.check_output(
                ["gsf", "cat", gsffilename, fullname])
            iter1 = add_pgiter(page, fn, "ole", fn, data, pn)

            if fn == "DesignerDoc":
                ftype = "dsf"
                page.model.set_value(iter1, 1, ("dsf", dirflag))
                dsf.open(page, data, iter1)
            if fn in ("EscherStm", "EscherDelayStm"):
                ftype = "escher"
                page.model.set_value(iter1, 1, ("escher", dirflag))
                # currently I don't parse it automagically for MSDOC
                escher.parse(page.model, data, iter1, "pub")
            if fn == "MagicTab":
                ftype = "wls"
                page.model.set_value(iter1, 1, ("wls", dirflag))
                wls.parse(page, data, iter1)
            if fn == "CONTENTS":
                if data[6:11] == "WT602":
                    ftype = "wt602"
                    page.model.set_value(iter1, 1, ("wt602", dirflag))
                    wt602.parse(page, data, iter1)
                elif fullname.split('/')[0] == "OleObjects":
                    # Nested OLE objects (or images) in WT602
                    wt602.parse_object(page, data, iter1)
                else:
                    ftype = "quill"
                    page.model.set_value(iter1, 1, ("quill", dirflag))
                    quill.parse(page, data, iter1)
            if fn == "Contents":
                if data[:2] == "\xe8\xac":  # take signature into account
                    ftype = "pub"
                    page.model.set_value(iter1, 1, ("pub", dirflag))
                    page.appcontentdoc = pub.PublisherContentDoc(page, iter1)
                    page.appcontentdoc.parse(data)
            if fn == "VisioDocument":
                ftype = "vsd"
                page.model.set_value(iter1, 1, ("vsd", dirflag))  # level = 1?
                vsd.parse(page, data, iter1)
            if fn == "PageMaker":
                ftype = "pm"
                page.model.set_value(iter1, 1, ("pm", dirflag))
                pm6.open(page, data, iter1)
            if fn == "WordDocument":
                ftype = "doc"
                page.model.set_value(iter1, 1, ("doc", dirflag))  # level = 1
                doc.parse(page, data, iter1)
            if fn in ("1Table", "0Table"):
                page.wtable = iter1
            if fn == "Data" and page.type == "DOC":
                page.wdata = iter1
            if fn in ("Book", "Workbook"):
                page.model.set_value(iter1, 1, ("xls", dirflag))
                ftype = xls.parse(page, data, iter1)
            if fn in ("PowerPoint Document", "Pictures"):
                ftype = "ppt"
                page.model.set_value(iter1, 1, ("ppt", dirflag))
                ppt.parse(page, data, iter1)
            if fn == "NativeContent_MAIN":
                ftype = "qpw"
                page.model.set_value(iter1, 1, ("qpw", dirflag))
                qpw.parse(page, data, iter1)
            if fn == "Signature" and data[:4] == '\x60\x67\x01\x00':
                ftype = "ppp"  # PagePlus OLE version (9.x?)
            if fn in ("contents", "SCFFPreview") and ftype == "ppp":
                ppp.parse(page, data, iter1, fn)
            # I've no idea if this is really the signature, but it is
            # present in all files I've seen so far
            if fn == "Header" and data[0xc:0xf] == 'xV4':
                ftype = 'zmf'
                zmf.zmf2_open(page, data, iter1, fn)
            if fn[-4:] == '.zmf':
                ftype = 'zmf'
                zmf.zmf2_open(page, data, iter1, fn)
            if fn[-4:] == '.BMI' and fullname.split('/')[0] == 'Bitmaps':
                ftype = 'bmi'
                bmi.open(data, page, iter1)
            if fn == "VBA":
                page.type = "vba"
                ftype = "vba"
            if "vba/dir" in fullname.lower():
                # Remember the VBA dir stream; it is parsed after the loop.
                page.model.set_value(iter1, 1, ("vba", dirflag))
                vbaiter = iter1
                vbadata = data
            if "SummaryInformation" in fn:
                page.model.set_value(iter1, 1, ("ole", "propset"))

        if vbaiter is not None:
            vba.parse(page, vbadata, vbaiter)
    except subprocess.CalledProcessError:
        print("Failed to run gsf. Please install libgsf.")
    finally:
        # Runs on every exit path, so the temporary copy is also removed
        # when one of the parsers raises.
        if tmpfile is not None:
            try:
                os.remove(tmpfile)
            except OSError:
                pass  # best-effort cleanup
    return ftype
def gsf_get_children(page, infile, parent, ftype, dirflag=0):
    """Add every child of *infile* to the tree and parse the known streams.

    Each child is read (the one named "dir" is uncompressed first), added
    through ``add_pgiter`` and, when its name matches a known stream, passed
    to the matching format parser.  The function then recurses into the
    child with the new tree iter as parent.

    Args:
        page: Page object; ``page.model`` is updated and ``page.type``,
            ``page.wtable`` and ``page.wdata`` may be set.
        infile: Container node exposing ``num_children()``,
            ``child_by_index()`` and ``name_by_index()``.
        parent: Tree iter the children are attached to.
        ftype: Type tag detected so far; used as the starting value.
        dirflag: Second element of the type tuples stored in the model.

    Returns:
        The type tag after all children of this level have been processed.
    """
    vbaiter = None
    vbadata = None
    for idx in range(infile.num_children()):
        infchild = infile.child_by_index(idx)
        infname = infile.name_by_index(idx)
        chsize = infchild.size()
        if infname and ord(infname[0]) < 32:
            # Drop a leading control character from the stream name.
            infname = infname[1:]

        if infname == "dir":
            infuncomp = infchild.uncompress()
            data = infuncomp.read(infuncomp.size())
        else:
            data = infchild.read(chsize)
        # Attach to *parent*; without it the recursion below would put
        # nested children at the top level.
        iter1 = add_pgiter(page, infname, "ole", dirflag, data, parent)

        if infname in ("EscherStm", "EscherDelayStm") and chsize > 0:
            ftype = "escher"
            page.model.set_value(iter1, 1, ("escher", dirflag))
            # currently I don't parse it automagically for MSDOC
            escher.parse(page.model, data, iter1, "pub")
        if infname == "CONTENTS":
            if data[6:11] == "WT602":
                ftype = "wt602"
                page.model.set_value(iter1, 1, ("wt602", dirflag))
                wt602.parse(page, data, iter1)
            else:
                ftype = "quill"
                page.model.set_value(iter1, 1, ("quill", dirflag))
                quill.parse(page, data, iter1)
        if infname == "Contents":
            if data and data[:2] == "\xe8\xac":  # take signature into account
                ftype = "pub"
                page.model.set_value(iter1, 1, ("pub", dirflag))
                pub.parse(page, data, iter1)
        if infname == "VisioDocument":
            ftype = "vsd"
            page.model.set_value(iter1, 1, ("vsd", dirflag))  # level = 1?
            # choose vsd or vsd2
            vsd.parse(page, data, iter1)
        if infname == "PageMaker":
            ftype = "pm"
            page.model.set_value(iter1, 1, ("pm", dirflag))
            pm6.open(page, data, iter1)
        if infname == "WordDocument":
            ftype = "doc"
            page.model.set_value(iter1, 1, ("doc", dirflag))  # level = 1
            doc.parse(page, data, iter1)
        if infname in ("1Table", "0Table"):
            page.wtable = iter1
        if infname == "Data":
            page.wdata = iter1
        if infname in ("Book", "Workbook"):
            page.model.set_value(iter1, 1, ("xls", dirflag))
            ftype = xls.parse(page, data, iter1)
        # Parenthesised so the None guard covers both stream names ("and"
        # binds tighter than "or").
        if (infname in ("PowerPoint Document", "Pictures")
                and data is not None):
            ftype = "ppt"
            page.model.set_value(iter1, 1, ("ppt", dirflag))
            ppt.parse(page, data, iter1)
        if infname == "NativeContent_MAIN":
            ftype = "qpw"
            page.model.set_value(iter1, 1, ("qpw", dirflag))
            qpw.parse(page, data, iter1)
        if infname == "Signature" and data[:4] == '\x60\x67\x01\x00':
            ftype = "ppp"  # PagePlus OLE version (9.x?)
        if infname in ("contents", "SCFFPreview") and ftype == "ppp":
            ppp.parse(page, data, iter1, infname)
        # I've no idea if this is really the signature, but it is
        # present in all files I've seen so far
        if infname == "Header" and data[0xc:0xf] == 'xV4':
            ftype = 'zmf'
            zmf.zmf2_open(page, data, iter1, infname)
        if infname[-4:] == '.zmf':
            ftype = 'zmf'
            zmf.zmf2_open(page, data, iter1, infname)
        if infname == "VBA":
            page.type = ftype
            ftype = "vba"
        if ftype == "vba" and infname == "dir":
            # Remember the VBA dir stream; it is parsed after the loop.
            page.model.set_value(iter1, 1, ("vba", dirflag))
            vbaiter = iter1
            vbadata = data
        # NOTE(review): this tests the container being iterated, so it is
        # always true inside the loop; infchild.num_children() may have been
        # intended -- confirm before changing.
        if infile.num_children() > 0:
            page.model.set_value(iter1, 1, (ftype, 1))
            gsf_get_children(page, infchild, iter1, ftype, 0)
        if "SummaryInformation" in infname:
            page.model.set_value(iter1, 1, ("ole", "propset"))

    if vbaiter is not None:
        vba.parse(page, vbadata, vbaiter)
    return ftype
def gsf_get_children(page, infile, parent, ftype, dirflag=0):
    """Add every child of *infile* to the tree and parse the known streams.

    Each child is read (the one named "dir" is uncompressed first), added
    through ``add_pgiter`` and, when its name matches a known stream, passed
    to the matching format parser.  The function then recurses into the
    child with the new tree iter as parent.

    Args:
        page: Page object; ``page.model`` is updated and ``page.type``,
            ``page.wtable`` and ``page.wdata`` may be set.
        infile: Container node exposing ``num_children()``,
            ``child_by_index()`` and ``name_by_index()``.
        parent: Tree iter the children are attached to.
        ftype: Type tag detected so far; used as the starting value.
        dirflag: Second element of the type tuples stored in the model.

    Returns:
        The type tag after all children of this level have been processed.
    """
    vbaiter = None
    vbadata = None
    for idx in range(infile.num_children()):
        infchild = infile.child_by_index(idx)
        infname = infile.name_by_index(idx)
        chsize = infchild.size()
        if infname and ord(infname[0]) < 32:
            # Drop a leading control character from the stream name.
            infname = infname[1:]

        if infname == "dir":
            infuncomp = infchild.uncompress()
            data = infuncomp.read(infuncomp.size())
        else:
            data = infchild.read(chsize)
        # Attach to *parent*; without it the recursion below would put
        # nested children at the top level.
        iter1 = add_pgiter(page, infname, "ole", dirflag, data, parent)

        if infname in ("EscherStm", "EscherDelayStm") and chsize > 0:
            ftype = "escher"
            page.model.set_value(iter1, 1, ("escher", dirflag))
            # currently I don't parse it automagically for MSDOC
            escher.parse(page.model, data, iter1, "pub")
        if infname == "CONTENTS":
            if data[6:11] == "WT602":
                ftype = "wt602"
                page.model.set_value(iter1, 1, ("wt602", dirflag))
                wt602.parse(page, data, iter1)
            else:
                ftype = "quill"
                page.model.set_value(iter1, 1, ("quill", dirflag))
                quill.parse(page, data, iter1)
        if infname == "Contents":
            if data and data[:2] == "\xe8\xac":  # take signature into account
                ftype = "pub"
                page.model.set_value(iter1, 1, ("pub", dirflag))
                pub.parse(page, data, iter1)
        if infname == "VisioDocument":
            ftype = "vsd"
            page.model.set_value(iter1, 1, ("vsd", dirflag))  # level = 1?
            # choose vsd or vsd2
            vsd.parse(page, data, iter1)
        if infname == "PageMaker":
            ftype = "pm"
            page.model.set_value(iter1, 1, ("pm", dirflag))
            pm6.open(page, data, iter1)
        if infname == "WordDocument":
            ftype = "doc"
            page.model.set_value(iter1, 1, ("doc", dirflag))  # level = 1
            doc.parse(page, data, iter1)
        if infname in ("1Table", "0Table"):
            page.wtable = iter1
        if infname == "Data":
            page.wdata = iter1
        if infname in ("Book", "Workbook"):
            page.model.set_value(iter1, 1, ("xls", dirflag))
            ftype = xls.parse(page, data, iter1)
        # Parenthesised so the None guard covers both stream names ("and"
        # binds tighter than "or").
        if (infname in ("PowerPoint Document", "Pictures")
                and data is not None):
            ftype = "ppt"
            page.model.set_value(iter1, 1, ("ppt", dirflag))
            ppt.parse(page, data, iter1)
        if infname == "NativeContent_MAIN":
            ftype = "qpw"
            page.model.set_value(iter1, 1, ("qpw", dirflag))
            qpw.parse(page, data, iter1)
        if infname == "Signature" and data[:4] == '\x60\x67\x01\x00':
            ftype = "ppp"  # PagePlus OLE version (9.x?)
        if infname in ("contents", "SCFFPreview") and ftype == "ppp":
            ppp.parse(page, data, iter1, infname)
        # I've no idea if this is really the signature, but it is
        # present in all files I've seen so far
        if infname == "Header" and data[0xc:0xf] == 'xV4':
            ftype = 'zmf'
            zmf.zmf2_open(page, data, iter1, infname)
        if infname[-4:] == '.zmf':
            ftype = 'zmf'
            zmf.zmf2_open(page, data, iter1, infname)
        if infname == "VBA":
            page.type = ftype
            ftype = "vba"
        if ftype == "vba" and infname == "dir":
            # Remember the VBA dir stream; it is parsed after the loop.
            page.model.set_value(iter1, 1, ("vba", dirflag))
            vbaiter = iter1
            vbadata = data
        # NOTE(review): this tests the container being iterated, so it is
        # always true inside the loop; infchild.num_children() may have been
        # intended -- confirm before changing.
        if infile.num_children() > 0:
            page.model.set_value(iter1, 1, (ftype, 1))
            gsf_get_children(page, infchild, iter1, ftype, 0)
        if "SummaryInformation" in infname:
            page.model.set_value(iter1, 1, ("ole", "propset"))

    if vbaiter is not None:
        vba.parse(page, vbadata, vbaiter)
    return ftype