示例#1
0
 def extract_text(self, data) -> str:
     '''
     Extract paragraph text from every "document.xml" entry of data.

     Walks data["Packed"]["Files"]; for each entry whose "Name" equals
     "document.xml" (case-insensitive) the file at entry["Path"] is parsed
     and, for every element matched by self.para, the text of the nested
     elements matched by self.text is concatenated into one line.

     Args:
         data: mapping with a ["Packed"]["Files"] list of dicts that carry
             "Name" and "Path" keys.

     Returns:
         The collected paragraphs joined with newlines; paragraphs without
         any text contribute an empty line. Empty string if nothing matched.
     '''
     paragraphs = []
     for entry in data["Packed"]["Files"]:
         if entry["Name"].lower() != "document.xml":
             continue
         # Close the handle deterministically instead of leaking it.
         with open(entry["Path"], "rb") as handle:
             tree = cetXML(handle.read())
         for par in tree.iter(self.para):
             # Empty elements have text None, which would break str.join.
             paragraphs.append(''.join(node.text or '' for node in par.iter(self.text)))
     return '\n'.join(paragraphs)
示例#2
0
 def extract_dde(self, data) -> str:
     '''
     Extract field-instruction (DDE) strings from "document.xml" entries.

     Walks data["Packed"]["Files"]; for each entry whose "Name" equals
     "document.xml" (case-insensitive) the file at entry["Path"] is parsed
     and, for every element matched by self.para, the text of the nested
     elements matched by self.instrtext is concatenated. Paragraphs that
     yield an empty string are skipped.

     Args:
         data: mapping with a ["Packed"]["Files"] list of dicts that carry
             "Name" and "Path" keys.

     Returns:
         The non-empty instruction strings joined with newlines, or an
         empty string if none were found.
     '''
     found = []
     for entry in data["Packed"]["Files"]:
         if entry["Name"].lower() != "document.xml":
             continue
         # Close the handle deterministically instead of leaking it.
         with open(entry["Path"], "rb") as handle:
             tree = cetXML(handle.read())
         for par in tree.iter(self.para):
             # Empty elements have text None, which would break str.join.
             instruction = ''.join(node.text or '' for node in par.iter(self.instrtext))
             if instruction:
                 found.append(instruction)
     return '\n'.join(found)
示例#3
0
 def office_meta_info(self, data) -> dict:
     '''
     Collect office metadata from the first "core.xml" entry of data.

     Walks data["Packed"]["Files"] until an entry whose "Name" equals
     "core.xml" (case-insensitive) is found, parses the file at
     entry["Path"] and looks up each known property as a direct child of
     the root element.

     OPC core properties are spread over several namespaces: the
     core-properties namespace (keywords, lastModifiedBy, revision),
     Dublin Core elements (title, subject, creator, description) and
     Dublin Core terms (created, modified). Each property is therefore
     tried in all of them, core-properties first.

     Args:
         data: mapping with a ["Packed"]["Files"] list of dicts that carry
             "Name" and "Path" keys.

     Returns:
         Dict mapping each property name that was found to its element
         text (None for an empty element). Empty dict if there is no
         "core.xml" entry or none of the properties are present.
     '''
     namespaces = (
         '{http://schemas.openxmlformats.org/package/2006/metadata/core-properties}',
         '{http://purl.org/dc/elements/1.1/}',
         '{http://purl.org/dc/terms/}',
     )
     meta = ["filename", "title", "subject", "creator", "keywords", "description", "lastModifiedBy", "revision", "modified", "created"]
     info = {}
     for entry in data["Packed"]["Files"]:
         if entry["Name"].lower() != "core.xml":
             continue
         # Close the handle deterministically instead of leaking it.
         with open(entry["Path"], "rb") as handle:
             tree = cetXML(handle.read())
         for item in meta:
             for namespace in namespaces:
                 element = tree.find("{}{}".format(namespace, item))
                 if element is not None:
                     info[item] = element.text
                     break
         # Only the first core.xml entry is considered.
         break
     return info