def __getImageDocumentList(self):
    """
        Extract the images from a document and return a list of Grain instances.

        Every ``draw:image`` element of the parsed content is inspected. An
        image is kept when its ``xlink:href`` points inside the package
        (``Pictures/`` or ``ObjectReplacements/``), its name ends in .png,
        .gif or .jpg, and no Grain with the same id was collected before.
        The text found next to the image's parent node becomes the caption.

        Returns a (possibly empty) list of image Grains.
    """
    image_list = []
    # the image references are kept in the draw:image elements
    for item in self.__parseContent.getElementsByTagName('draw:image'):
        if not item.hasAttribute("xlink:href"):
            continue
        path = item.getAttribute('xlink:href')

        # An image linked from a website is not stored in the package.
        # Skip it before the substring tests below, otherwise a URL that
        # happens to contain "Pictures" would be looked up in the zip file.
        lowered = path.lower()
        if lowered.startswith("http://") or lowered.startswith("https://"):
            continue

        if "Pictures" in path:
            # strip the folder prefix, keeping only the file name
            name = path.replace("Pictures/", "")
        elif "ObjectReplacements" in path:
            name = path.replace("./ObjectReplacements/", "")
            # removes the "./" of the path that could be "./ObjectReplacements/Object 2"
            path = path.replace("./", "")
        else:
            continue

        # checks the image extension
        extension = os.path.splitext(name)[1]
        if extension.lower() not in ('.png', '.gif', '.jpg'):
            continue

        # verifies if the image is already in the list
        if name in [image.getId() for image in image_list]:
            continue

        objGran = Grain(graintype='image')

        # the caption is gathered from the node following the image's parent
        nChild = item.parentNode.nextSibling
        if nChild:
            text = []
            # nodeType values are plain integers: compare by value, not identity
            if nChild.nodeType == nChild.TEXT_NODE:
                text.append(nChild.data)
            pieces = self.__getTextChildNodesImage(nChild, text)
            caption = ''.join([t for t in pieces if t is not None])
            objGran.setCaption(caption)

        imagefile = StringIO(self.__zipFile.read(path))
        objGran.setId(name)
        objGran.setContent(imagefile)
        image_list.append(objGran)

    return image_list
try: converterObj = ExecuteConverter.ExecuteConverter() converterObj.extractTables(pdfFile, outputXMLFolder) tableListStr = converterObj.getTableList() except Exception, e: return tableList i = 0 for table in tableListStr: # generate table name i += 1 tableId = "Table" + str(i) + ".html" # finally, the Grain is created en added to the list grainObj = Grain(graintype='table') grainObj.setId(tableId) grainObj.setContent(StringIO(table)) grainObj.setMimetype("text/html") tableList.append(grainObj) return tableList ### Public Methods ### def getThumbnailsDocument(self): """ Extracts the metadata from pdf files using 'convert' tool """ os.system( 'evince-thumbnailer -s 128 "' + os.path.join(self.__pathFolder, self.Document.getFilename()) + '" ' + self.__pathFolder + '/thumbnail.png')
try: converterObj = ExecuteConverter.ExecuteConverter() converterObj.extractTables(pdfFile, outputXMLFolder) tableListStr = converterObj.getTableList() except Exception, e: return tableList i = 0 for table in tableListStr: # generate table name i+=1 tableId = "Table" + str(i) + ".html" # finally, the Grain is created en added to the list grainObj = Grain(graintype='table') grainObj.setId(tableId) grainObj.setContent(StringIO(table)) grainObj.setMimetype("text/html") tableList.append(grainObj) return tableList ### Public Methods ### def getThumbnailsDocument(self): """ Extracts the metadata from pdf files using 'convert' tool """ os.system('evince-thumbnailer -s 128 "' + os.path.join(self.__pathFolder,self.Document.getFilename()) + '" ' + self.__pathFolder +'/thumbnail.png') file_content = StringIO(open(self.__pathFolder +'/thumbnail.png').read()) os.remove(self.__pathFolder + '/thumbnail.png') return file_content
def __getTableDocumentList(self):
    """
        Extract the tables from a document and return a list of Grain instances.

        For every ``table:table`` element a fresh document is built from the
        empty template: the table node, the ``style:style`` nodes it uses
        and the package images it references are copied into it. The text
        of the nodes directly before and after the table is used as caption.
        Only tables that carry a ``table:name`` are added to the result.

        Returns a (possibly empty) list of table Grains.
    """
    table_list = []
    # create an empty template
    template_str = self.__createNewOOoDocument()
    tables = self.__parseContent.getElementsByTagName('table:table')
    stylesDoc = self.__parseContent.getElementsByTagName('style:style')

    for t in tables:
        styles = self.__getAttributesR(t)
        table_name = t.getAttribute('table:name')

        # collect the images of the table that are stored inside the package
        imgHrefs = []
        for img in t.getElementsByTagName("draw:image"):
            if not img.hasAttribute("xlink:href"):
                continue
            path = img.getAttribute('xlink:href')
            # An image linked from a website cannot be read from the zip
            # file, whatever its scheme or extension looks like.
            lowered = path.lower()
            if lowered.startswith("http://") or lowered.startswith("https://"):
                continue
            if "ObjectReplacements" in path:
                # remove the "./" of the path that could be "./ObjectReplacements/Object 2"
                imgHrefs.append(path.replace("./", ""))
            else:
                imgHrefs.append(path)

        # extract legend: text of the nodes right before and right after the table
        objGran = Grain(graintype='table')
        leg = []
        for sibling in (t.previousSibling, t.nextSibling):
            if sibling is None:
                continue
            if sibling.hasChildNodes():
                legenda = ''
                for piece in self.__getTextChildNodesTable(sibling, text=[]):
                    legenda += piece
                leg.append(legenda)
            else:
                leg.append(self.__getNodeText(sibling))
        # join the strings to make a single legend
        caption = ' '.join([part for part in leg if part is not None])
        objGran.setCaption(caption)

        # start the new document from a copy of the empty template
        new_table = StringIO()
        new_table.write(template_str)
        template_odt = zipfile.PyZipFile(new_table, 'a')
        try:
            doc = parseString(template_odt.read('content.xml'))
        finally:
            template_odt.close()

        # copy the table node from the document to the new table grain
        office_text = doc.getElementsByTagName('office:text')[0]
        office_text.appendChild(doc.importNode(t, True))

        # copy the styles used by the table; the target node is looked up
        # only once, and only when at least one style has to be copied
        automatic_styles = None
        for sty in stylesDoc:
            if sty.getAttribute('style:name') in styles:
                if automatic_styles is None:
                    automatic_styles = doc.getElementsByTagName(
                        'office:automatic-styles')[0]
                automatic_styles.appendChild(doc.importNode(sty, True))

        # append the images and the new content.xml in a single pass
        template_odt = zipfile.PyZipFile(new_table, 'a')
        try:
            for image in imgHrefs:
                template_odt.writestr(str(image), self.__zipFile.read(image))
            template_odt.writestr('content.xml', doc.toxml().encode('utf-8'))
        finally:
            template_odt.close()

        if table_name:
            objGran.setId(table_name)
            objGran.setContent(new_table)
            objGran.setMimetype("application/vnd.oasis.opendocument.text")
            table_list.append(objGran)

    return table_list