def extract_text(self, data) -> str:
    '''
    Collect paragraph text from every "document.xml" entry.

    Args:
        data: mapping whose data["Packed"]["Files"] is an iterable of
            entries, each carrying a "Name" and a "Path" key.

    Returns:
        One line per element matching self.para, each line being the
        concatenated text of the self.text elements inside it, joined with
        newlines. An empty string when no entry is named "document.xml"
        (the name comparison is case-insensitive).
    '''
    lines = []
    for entry in data["Packed"]["Files"]:
        if entry["Name"].lower() != "document.xml":
            continue
        # Context manager closes the handle right away instead of leaking it.
        with open(entry["Path"], "rb") as handle:
            # NOTE(review): cetXML is defined outside this block; presumably
            # ElementTree's XML() parser - confirm against the file imports.
            tree = cetXML(handle.read())
        for par in tree.iter(self.para):
            # An empty element exposes .text as None; substitute "" so that
            # join() does not raise TypeError on such documents.
            lines.append(''.join(node.text or '' for node in par.iter(self.text)))
    return '\n'.join(lines)
def extract_dde(self, data) -> str:
    '''
    Extract dde

    Walks every entry named "document.xml" (case-insensitive) and, for each
    element matching self.para, concatenates the text of the self.instrtext
    elements inside it.

    Args:
        data: mapping whose data["Packed"]["Files"] is an iterable of
            entries, each carrying a "Name" and a "Path" key.

    Returns:
        The non-empty concatenated strings, one per line. An empty string
        when nothing was found.
    '''
    fields = []
    for entry in data["Packed"]["Files"]:
        if entry["Name"].lower() != "document.xml":
            continue
        # Context manager closes the handle right away instead of leaking it.
        with open(entry["Path"], "rb") as handle:
            # NOTE(review): cetXML is defined outside this block; presumably
            # ElementTree's XML() parser - confirm against the file imports.
            tree = cetXML(handle.read())
        for par in tree.iter(self.para):
            # An empty element exposes .text as None; substitute "" so that
            # join() does not raise TypeError.
            instruction = ''.join(node.text or '' for node in par.iter(self.instrtext))
            # Paragraphs without any instruction text are skipped entirely.
            if instruction:
                fields.append(instruction)
    return '\n'.join(fields)
def office_meta_info(self, data) -> dict:
    '''
    get office meta data

    Reads the first entry named "core.xml" (case-insensitive) and looks up a
    fixed list of property names among the direct children of its root.

    Args:
        data: mapping whose data["Packed"]["Files"] is an iterable of
            entries, each carrying a "Name" and a "Path" key.

    Returns:
        Dict mapping each property name that was found to the element's
        text (None for an empty element). Empty dict when there is no
        "core.xml" entry.
    '''
    # Core properties are spread over three namespaces: cp holds e.g.
    # keywords/lastModifiedBy/revision, dc holds title/subject/creator/
    # description, and dcterms holds created/modified. cp is tried first so
    # anything a cp-only lookup would find keeps the same value.
    namespaces = (
        '{http://schemas.openxmlformats.org/package/2006/metadata/core-properties}',
        '{http://purl.org/dc/elements/1.1/}',
        '{http://purl.org/dc/terms/}',
    )
    meta = ["filename", "title", "subject", "creator", "keywords", "description",
            "lastModifiedBy", "revision", "modified", "created"]
    found = {}
    for entry in data["Packed"]["Files"]:
        if entry["Name"].lower() != "core.xml":
            continue
        # Context manager closes the handle right away instead of leaking it.
        with open(entry["Path"], "rb") as handle:
            # NOTE(review): cetXML is defined outside this block; presumably
            # ElementTree's XML() parser - confirm against the file imports.
            tree = cetXML(handle.read())
        for item in meta:
            for namespace in namespaces:
                element = tree.find("{}{}".format(namespace, item))
                if element is not None:
                    found[item] = element.text
                    break
        # Only the first core.xml entry is consulted.
        break
    return found