def loadXML(self, filename, sentenceFilter, loadRegistries):
    """Read an XML file containing an abstract and populate this object.

    Only the first ``abstract`` element of the document is used.  Its
    ``id`` attribute is stored in ``self.id``; the optional
    ``PublicationInformation``, ``title``, ``affiliation`` and ``body``
    children (first occurrence of each) are parsed into the matching
    attributes.  Body sentences accepted by ``sentenceFilter`` are
    appended to ``self.sentences`` (the list is extended, not reset, so
    it must already exist).  Finally the acronym table is rebuilt.

    Args:
        filename: file name (or open file object) handed to
            ``xml.dom.minidom.parse``.
        sentenceFilter: callable invoked with each body sentence; the
            sentence is kept only when the result compares equal to
            ``True``.
        loadRegistries: not used by this method; retained so existing
            callers keep working.

    Raises:
        IndexError: if the document contains no ``abstract`` element.
    """
    xmldoc = xml.dom.minidom.parse(filename)
    try:
        # Only the first <abstract> element is considered.
        abstractNode = xmldoc.getElementsByTagName('abstract')[0]
        xmlutil.normalizeXMLTree(abstractNode)
        self.id = abstractNode.getAttribute('id')

        # read journal, author, publication info
        nodes = abstractNode.getElementsByTagName('PublicationInformation')
        if nodes:
            self.publicationInformation = publicationinfo.PublicationInfo(
                nodes[0])

        # read title
        nodes = abstractNode.getElementsByTagName('title')
        if nodes:
            self.titleSentences = xmlutil.parseSentences(nodes[0], self)

        # read affiliation
        nodes = abstractNode.getElementsByTagName('affiliation')
        if nodes:
            self.affiliationSentences = xmlutil.parseSentences(nodes[0], self)

        # read abstract body text
        nodes = abstractNode.getElementsByTagName('body')
        if nodes:
            self.__allSentences = xmlutil.parseSentences(nodes[0], self)
            for s in self.__allSentences:
                # Explicit comparison kept on purpose: a filter returning a
                # truthy non-boolean (e.g. a string) must still reject the
                # sentence, exactly as before.
                if sentenceFilter(s) == True:
                    self.sentences.append(s)

        # NOTE: reports are not read here; no code exists for that step.
    finally:
        # Remove links within the xml doc so GC can reclaim memory faster.
        # Done in ``finally`` so the DOM is released even when parsing one
        # of the sections above raises.
        xmldoc.unlink()

    self.__buildAcronymTable()
def getXML(self, doc):
    """Build and return a ``PublicationInformation`` element.

    The element is created with ``doc.createElement`` and receives, in
    this order: the stored journal node (if any), a ``Country`` child
    produced by ``xmlutil.createNodeWithTextChild`` from
    ``self._country``, the stored author-list node (if any) and the
    stored publication-type-list node (if any).  The stored nodes
    themselves are appended, not copies.  The assembled element is
    passed through ``xmlutil.normalizeXMLTree`` before being returned.

    Args:
        doc: DOM document used as the factory for new elements.

    Returns:
        The newly created ``PublicationInformation`` element.
    """
    root = doc.createElement('PublicationInformation')

    journal = self._journalNode
    if journal is not None:
        root.appendChild(journal)

    # The country entry is always written, unlike the optional nodes.
    country = xmlutil.createNodeWithTextChild(doc, 'Country', self._country)
    root.appendChild(country)

    authors = self._authorListNode
    if authors is not None:
        root.appendChild(authors)

    publicationTypes = self._publicationTypeListNode
    if publicationTypes is not None:
        root.appendChild(publicationTypes)

    xmlutil.normalizeXMLTree(root)
    return root
def loadXML(self, filename, sentenceFilter, loadRegistries):
    """Read an XML file containing an abstract and populate this object.

    Only the first ``abstract`` element of the document is used.  Its
    ``id`` attribute is stored in ``self.id``; the optional
    ``PublicationInformation``, ``title``, ``affiliation`` and ``body``
    children (first occurrence of each) are parsed into the matching
    attributes.  Body sentences accepted by ``sentenceFilter`` are
    appended to ``self.sentences`` (the list is extended, not reset, so
    it must already exist).  Finally the acronym table is rebuilt.

    Args:
        filename: file name (or open file object) handed to
            ``xml.dom.minidom.parse``.
        sentenceFilter: callable invoked with each body sentence; the
            sentence is kept only when the result compares equal to
            ``True``.
        loadRegistries: not used by this method; retained so existing
            callers keep working.

    Raises:
        IndexError: if the document contains no ``abstract`` element.
    """
    xmldoc = xml.dom.minidom.parse(filename)
    try:
        # Only the first <abstract> element is considered.
        abstractNode = xmldoc.getElementsByTagName('abstract')[0]
        xmlutil.normalizeXMLTree(abstractNode)
        self.id = abstractNode.getAttribute('id')

        # read journal, author, publication info
        nodes = abstractNode.getElementsByTagName('PublicationInformation')
        if nodes:
            self.publicationInformation = publicationinfo.PublicationInfo(
                nodes[0])

        # read title
        nodes = abstractNode.getElementsByTagName('title')
        if nodes:
            self.titleSentences = xmlutil.parseSentences(nodes[0], self)

        # read affiliation
        nodes = abstractNode.getElementsByTagName('affiliation')
        if nodes:
            self.affiliationSentences = xmlutil.parseSentences(nodes[0], self)

        # read abstract body text
        nodes = abstractNode.getElementsByTagName('body')
        if nodes:
            self.__allSentences = xmlutil.parseSentences(nodes[0], self)
            for s in self.__allSentences:
                # Explicit comparison kept on purpose: a filter returning a
                # truthy non-boolean (e.g. a string) must still reject the
                # sentence, exactly as before.
                if sentenceFilter(s) == True:
                    self.sentences.append(s)

        # NOTE: reports are not read here; no code exists for that step.
    finally:
        # Remove links within the xml doc so GC can reclaim memory faster.
        # Done in ``finally`` so the DOM is released even when parsing one
        # of the sections above raises.
        xmldoc.unlink()

    self.__buildAcronymTable()