示例#1
0
   def extract(self, data, dependency_results):
      """Produce a plain-text rendering of the TEI document body.

      The ``xml_result`` of the ``interfaces.FullTextTEIExtractor`` entry in
      ``dependency_results`` is searched for ``./text/body``. That element is
      serialized, converted with ``utils.xml_to_plain_text`` and stored,
      UTF-8 encoded, under the ``'.txt'`` key of the result's ``files``.

      ``data`` is not used by this method.

      Returns an ``ExtractorResult`` with ``xml_result=None`` on success, or
      a ``RunnableError`` instance when the body element is missing.
      """
      tei_result = dependency_results[interfaces.FullTextTEIExtractor]
      body = tei_result.xml_result.find('./text/body')
      if body is None:
         # NOTE(review): the error is returned rather than raised; confirm
         # the calling framework treats a returned RunnableError as failure.
         return RunnableError('Could not find body text in TEI xml file')

      # Serialize the body subtree, then work on text until the final encode.
      body_markup = ET.tostring(body).decode('utf-8')
      text_bytes = utils.xml_to_plain_text(body_markup).encode('utf-8')
      return ExtractorResult(xml_result=None, files={'.txt': text_bytes})
示例#2
0
    def extract(self, data, dependency_results):
        """Convert the body of a TEI document into a plain-text file.

        Looks up the ``interfaces.FullTextTEIExtractor`` result in
        ``dependency_results`` and finds ``./text/body`` in its
        ``xml_result``. The serialized element is passed through
        ``utils.xml_to_plain_text`` and the UTF-8 encoded text is placed
        under the ``'.txt'`` key of the returned result's ``files``.

        ``data`` is not used by this method.

        Returns an ``ExtractorResult`` (``xml_result`` is ``None``) on
        success, or a ``RunnableError`` instance if no body element exists.
        """
        tei_root = dependency_results[
            interfaces.FullTextTEIExtractor].xml_result
        body = tei_root.find('./text/body')
        if body is None:
            # NOTE(review): the error is returned, not raised; verify that
            # the caller handles a returned RunnableError as a failure.
            return RunnableError('Could not find body text in TEI xml file')

        # Decode the serialized subtree so the conversion operates on text.
        markup = ET.tostring(body).decode('utf-8')
        encoded_text = utils.xml_to_plain_text(markup).encode('utf-8')

        return ExtractorResult(xml_result=None, files={'.txt': encoded_text})
示例#3
0
    def extract(self, data, dep_results):
        """Build a CSX-style header XML document from a TEI header result.

        Reads the ``xml_result`` of the ``interfaces.HeaderTEIExtractor``
        entry in ``dep_results`` and copies the title, document-level
        affiliations, authors (name and affiliations), keywords and the
        first abstract into a new ``<algorithm>`` element.

        ``data`` is not used by this method.

        Returns:
            An ``ExtractorResult`` whose ``xml_result`` is the new element.

        Raises:
            RunnableError: if the TEI document has no title element.
        """
        tei_root = dep_results[interfaces.HeaderTEIExtractor].xml_result
        result_root = ET.Element('algorithm', {
            'name': 'Grobid Header Extraction',
            'version': '0.1'
        })

        # Retrieve title from TEI doc; a missing title aborts the extraction
        title = tei_root.find('./teiHeader//titleStmt/title')
        if title is None:
            raise RunnableError('No title found in TEI document')
        ET.SubElement(result_root, 'title').text = title.text

        # Find document-level affiliations
        affiliations = tei_root.findall(
            './teiHeader//sourceDesc/biblStruct/analytic/affiliation')
        if affiliations:
            affiliation_str = " | ".join(
                map(_get_affiliation_str, affiliations))
            ET.SubElement(result_root, 'affiliation').text = affiliation_str

        # Retrieve author names from TEI doc
        authors = tei_root.findall('./teiHeader//biblStruct//author')
        authors_node = ET.SubElement(result_root, 'authors')
        if authors:
            for author in authors:
                author_node = ET.SubElement(authors_node, 'author')

                # Find and output name-related info: forenames, then surnames
                name_tags = (author.findall('./persName/forename') +
                             author.findall('./persName/surname'))

                # An empty element such as <forename/> has text None, which
                # would make str.join raise; skip those parts.
                name_parts = [
                    tag.text for tag in name_tags if tag.text is not None
                ]
                ET.SubElement(author_node, 'name').text = ' '.join(name_parts)

                # Find and output affiliation-related info
                author_affiliations = author.findall('./affiliation')
                if author_affiliations:
                    # Use a pipe to delimit separate affiliations
                    affiliation_str = " | ".join(
                        map(_get_affiliation_str, author_affiliations))
                    ET.SubElement(author_node,
                                  'affiliation').text = affiliation_str
        else:
            self.log('No authors found')

        # Retrieve keywords from TEI doc
        keywords = tei_root.findall('./teiHeader//keywords//item/term')
        keywords_node = ET.SubElement(result_root, 'keywords')
        if keywords:
            for term in keywords:
                ET.SubElement(keywords_node, 'keyword').text = term.text
        else:
            self.log('No keywords found')

        # Try and find an abstract; only the first matching div is used
        divs = tei_root.findall('./text//div')
        abstracts = [div for div in divs if div.get('type') == 'abstract']
        if abstracts:
            # ET.tostring returns bytes on Python 3; decode so the str regex
            # below does not raise TypeError.
            xml_string = ET.tostring(abstracts[0]).decode('utf-8')
            # Drop the <head> element so the heading is not part of the text
            remove_heading = re.compile(r'\s*<head.*?>.*?<\s*/\s*head>',
                                        re.DOTALL | re.UNICODE)
            xml_string = remove_heading.sub('', xml_string)
            abstract_string = utils.xml_to_plain_text(xml_string)

            ET.SubElement(result_root, 'abstract').text = abstract_string
        else:
            self.log('No abstract found')

        # CSX style xml document of header information
        return ExtractorResult(xml_result=result_root)
示例#4
0
   def extract(self, data, dep_results):
      """Build a CSX-style header XML document from a TEI header result.

      Reads the ``xml_result`` of the ``interfaces.HeaderTEIExtractor`` entry
      in ``dep_results`` and copies the title, document-level affiliations,
      authors (name and affiliations), keywords and the first abstract into
      a new ``<algorithm>`` element.

      ``data`` is not used by this method.

      Returns:
         An ``ExtractorResult`` whose ``xml_result`` is the new element.

      Raises:
         RunnableError: if the TEI document has no title element.
      """
      tei_root = dep_results[interfaces.HeaderTEIExtractor].xml_result
      result_root = ET.Element('algorithm', {'name': 'Grobid Header Extraction', 'version': '0.1'})

      # Retrieve title from TEI doc; a missing title aborts the extraction
      title = tei_root.find('./teiHeader//titleStmt/title')
      if title is None:
         raise RunnableError('No title found in TEI document')
      ET.SubElement(result_root, 'title').text = title.text

      # Find document-level affiliations
      affiliations = tei_root.findall('./teiHeader//sourceDesc/biblStruct/analytic/affiliation')
      if affiliations:
         affiliation_str = " | ".join(map(_get_affiliation_str, affiliations))
         ET.SubElement(result_root, 'affiliation').text = affiliation_str

      # Retrieve author names from TEI doc
      authors = tei_root.findall('./teiHeader//biblStruct//author')
      authors_node = ET.SubElement(result_root, 'authors')
      if authors:
         for author in authors:
            author_node = ET.SubElement(authors_node, 'author')

            # Find and output name-related info: forenames, then surnames
            name_tags = (author.findall('./persName/forename') +
                         author.findall('./persName/surname'))

            # An empty element such as <forename/> has text None, which
            # would make str.join raise; skip those parts.
            name_parts = [tag.text for tag in name_tags if tag.text is not None]
            ET.SubElement(author_node, 'name').text = ' '.join(name_parts)

            # Find and output affiliation-related info
            author_affiliations = author.findall('./affiliation')
            if author_affiliations:
               # Use a pipe to delimit separate affiliations
               affiliation_str = " | ".join(map(_get_affiliation_str, author_affiliations))
               ET.SubElement(author_node, 'affiliation').text = affiliation_str
      else:
         self.log('No authors found')

      # Retrieve keywords from TEI doc
      keywords = tei_root.findall('./teiHeader//keywords//item/term')
      keywords_node = ET.SubElement(result_root, 'keywords')
      if keywords:
         for term in keywords:
            ET.SubElement(keywords_node, 'keyword').text = term.text
      else:
         self.log('No keywords found')

      # Try and find an abstract; only the first matching div is used
      divs = tei_root.findall('./text//div')
      abstracts = [div for div in divs if div.get('type') == 'abstract']
      if abstracts:
         # ET.tostring returns bytes on Python 3; decode so the str regex
         # below does not raise TypeError.
         xml_string = ET.tostring(abstracts[0]).decode('utf-8')
         # Drop the <head> element so the heading is not part of the text
         remove_heading = re.compile(r'\s*<head.*?>.*?<\s*/\s*head>', re.DOTALL | re.UNICODE)
         xml_string = remove_heading.sub('', xml_string)
         abstract_string = utils.xml_to_plain_text(xml_string)

         ET.SubElement(result_root, 'abstract').text = abstract_string
      else:
         self.log('No abstract found')

      # CSX style xml document of header information
      return ExtractorResult(xml_result=result_root)