示例#1
0
            converterObj = ExecuteConverter.ExecuteConverter()
            converterObj.extractTables(pdfFile, outputXMLFolder)
            tableListStr = converterObj.getTableList()
        except Exception, e:
            return tableList

        i = 0
        for table in tableListStr:
            # generate table name
            i+=1
            tableId = "Table" + str(i) + ".html"
            # finally, the Grain is created en added to the list
            grainObj = Grain(graintype='table')
            grainObj.setId(tableId)
            grainObj.setContent(StringIO(table))
            grainObj.setMimetype("text/html")
            tableList.append(grainObj)

        return tableList

    ### Public Methods ###

    def getThumbnailsDocument(self):
        """
            Render a thumbnail of the document with the 'evince-thumbnailer'
            tool (size option '-s 128') and return the PNG data as a StringIO.

            The thumbnail is written to 'thumbnail.png' inside the working
            folder, read back into memory and then deleted from disk.

            Raises IOError when the thumbnail file cannot be opened, e.g.
            because the thumbnailer did not produce it.
        """
        import subprocess

        source_path = os.path.join(self.__pathFolder,
                                   self.Document.getFilename())
        thumbnail_path = self.__pathFolder + '/thumbnail.png'

        try:
            # An argument list (no shell) keeps quotes, spaces or shell
            # metacharacters in the file name from being interpreted.
            subprocess.call(['evince-thumbnailer', '-s', '128',
                             source_path, thumbnail_path])
        except OSError:
            # The tool could not be started. Fall through: the open() below
            # then fails with IOError, exactly as it did when the command
            # was run through the shell.
            pass

        # PNG data is binary, so read it in 'rb' mode.
        thumbnail_file = open(thumbnail_path, 'rb')
        try:
            file_content = StringIO(thumbnail_file.read())
        finally:
            # Close the handle and drop the temporary file even if the
            # read fails.
            thumbnail_file.close()
            os.remove(thumbnail_path)
        return file_content
示例#2
0
            converterObj = ExecuteConverter.ExecuteConverter()
            converterObj.extractTables(pdfFile, outputXMLFolder)
            tableListStr = converterObj.getTableList()
        except Exception, e:
            return tableList

        i = 0
        for table in tableListStr:
            # generate table name
            i += 1
            tableId = "Table" + str(i) + ".html"
            # finally, the Grain is created en added to the list
            grainObj = Grain(graintype='table')
            grainObj.setId(tableId)
            grainObj.setContent(StringIO(table))
            grainObj.setMimetype("text/html")
            tableList.append(grainObj)

        return tableList

    ### Public Methods ###

    def getThumbnailsDocument(self):
        """
            Extracts the metadata from pdf files using 'convert' tool
        """
        os.system(
            'evince-thumbnailer -s 128 "' +
            os.path.join(self.__pathFolder, self.Document.getFilename()) +
            '" ' + self.__pathFolder + '/thumbnail.png')
        file_content = StringIO(
    def __getTableDocumentList(self):
        """
            Extract the tables from a document and return a list of Grain instances

            For every 'table:table' element of the parsed content a new
            document is built from the empty template. It receives a copy of
            the table node, the 'style:style' nodes whose 'style:name' is in
            the result of __getAttributesR for that table, and the package
            files referenced by the table's 'draw:image' elements.
            The text of the nodes directly before and after the table is
            joined into the grain caption.

            Tables without a 'table:name' attribute are not included.
            Returns a (possibly empty) list.
        """
        table_list = []
        # create an empty template
        template_str = self.__createNewOOoDocument()
        tables = self.__parseContent.getElementsByTagName('table:table')
        stylesDoc = self.__parseContent.getElementsByTagName('style:style')
        for t in tables:
            styles = self.__getAttributesR(t)
            table_name = t.getAttribute('table:name')

            # collect the package paths of the images used by the table
            imgHrefs = []
            for img in t.getElementsByTagName("draw:image"):
                if not img.hasAttribute("xlink:href"):
                    continue
                path = img.getAttribute('xlink:href')
                # an empty href points at nothing that could be copied
                if not path:
                    continue
                # An image linked from a website is not stored in the
                # package, so there is nothing to copy for it. The previous
                # pattern "[\.jpg|\.png|\.gif]$" was a character class that
                # only tested the last character and ignored https links.
                lowered = path.lower()
                if lowered.startswith("http://") or \
                        lowered.startswith("https://"):
                    continue
                if "ObjectReplacements" in path:
                    # remove the "./" of the path that could be
                    # "./ObjectReplacements/Object 2"
                    imgHrefs.append(path.replace("./", ""))
                else:
                    imgHrefs.append(path)

            # extract legend: text of the previous sibling first, then of
            # the next one
            objGran = Grain(graintype='table')
            leg = []
            for sibling in (t.previousSibling, t.nextSibling):
                if sibling is None:
                    continue
                if sibling.hasChildNodes():
                    pieces = self.__getTextChildNodesTable(sibling, text=[])
                    leg.append(''.join(pieces))
                else:
                    leg.append(self.__getNodeText(sibling))

            # join the strings to make a single legend
            caption = ' '.join([part for part in leg if part is not None])
            objGran.setCaption(caption)

            # Creating an empty File from the template and parsing its
            # content.xml
            new_table = StringIO()
            new_table.write(template_str)
            template_odt = zipfile.PyZipFile(new_table, 'a')
            try:
                doc = parseString(template_odt.read('content.xml'))
            finally:
                template_odt.close()
            office_text = doc.getElementsByTagName('office:text')[0]

            # copy the table node from a document to a new table grain
            office_text.appendChild(doc.importNode(t, True))

            # copy the styles used by the table; the container is indexed
            # only when a style really has to be added
            automatic_styles = doc.getElementsByTagName(
                'office:automatic-styles')
            for sty in stylesDoc:
                if sty.getAttribute('style:name') in styles:
                    automatic_styles[0].appendChild(doc.importNode(sty, True))

            # Add the images and the new content.xml in a single append
            # session instead of reopening the archive for every entry.
            # NOTE(review): assumes self.__zipFile is the source package
            # opened as a zipfile.ZipFile - confirm.
            template_odt = zipfile.PyZipFile(new_table, 'a')
            try:
                for image in imgHrefs:
                    template_odt.writestr(str(image),
                                          self.__zipFile.read(image))
                template_odt.writestr('content.xml',
                                      doc.toxml().encode('utf-8'))
            finally:
                template_odt.close()

            if table_name:
                # the table name is used as the id without normalisation
                objGran.setId(table_name)
                objGran.setContent(new_table)
                objGran.setMimetype("application/vnd.oasis.opendocument.text")
                table_list.append(objGran)
        return table_list