def extract(self, data, dependency_results):
    """Render the body of a TEI document as UTF-8 encoded plain text.

    The parsed TEI tree is read from the ``xml_result`` attribute of the
    ``interfaces.FullTextTEIExtractor`` entry in ``dependency_results``.
    Its ``./text/body`` element is serialised, converted to plain text by
    ``utils.xml_to_plain_text`` and encoded as UTF-8.

    Args:
        data: Not used by this extractor.
        dependency_results: Mapping indexed by dependency class; must hold
            an entry for ``interfaces.FullTextTEIExtractor``.

    Returns:
        An ``ExtractorResult`` with ``xml_result=None`` and a ``files``
        dict mapping ``'.txt'`` to the encoded text, or a ``RunnableError``
        instance (returned, not raised) when the TEI tree has no body.
    """
    tei_result = dependency_results[interfaces.FullTextTEIExtractor]
    body = tei_result.xml_result.find('./text/body')

    if body is None:
        # NOTE(review): the error object is returned rather than raised;
        # kept as-is because the caller's handling is not visible here.
        return RunnableError('Could not find body text in TEI xml file')

    body_markup = ET.tostring(body).decode('utf-8')
    encoded_text = utils.xml_to_plain_text(body_markup).encode('utf-8')
    return ExtractorResult(xml_result=None, files={'.txt': encoded_text})
def extract(self, data, dependency_results):
    """Convert the TEI ``<body>`` element into a ``.txt`` payload.

    Looks up the ``interfaces.FullTextTEIExtractor`` result in
    ``dependency_results``, takes the ``./text/body`` node of its
    ``xml_result`` tree, serialises that node, passes the markup through
    ``utils.xml_to_plain_text`` and encodes the outcome as UTF-8.

    Args:
        data: Not referenced by this method.
        dependency_results: Mapping indexed by dependency class; must
            contain ``interfaces.FullTextTEIExtractor``.

    Returns:
        ``ExtractorResult(xml_result=None, files={'.txt': <bytes>})`` on
        success. When no body node exists a ``RunnableError`` instance is
        returned (not raised).
    """
    tei_tree = dependency_results[interfaces.FullTextTEIExtractor].xml_result
    body_element = tei_tree.find('./text/body')

    if body_element is not None:
        serialised = ET.tostring(body_element).decode('utf-8')
        text_bytes = utils.xml_to_plain_text(serialised).encode('utf-8')
        return ExtractorResult(xml_result=None, files={'.txt': text_bytes})

    # NOTE(review): returned, not raised -- preserved because the calling
    # code is outside this view.
    return RunnableError('Could not find body text in TEI xml file')
def extract(self, data, dep_results):
    """Build a CSX-style header XML document from a Grobid TEI header.

    Reads the parsed TEI tree from the ``xml_result`` attribute of the
    ``interfaces.HeaderTEIExtractor`` entry in ``dep_results`` and copies
    the title, document-level affiliations, authors (name and
    affiliations), keywords and abstract into a new ``algorithm`` element.

    Args:
        data: Not used by this extractor.
        dep_results: Mapping indexed by dependency class; must hold an
            entry for ``interfaces.HeaderTEIExtractor``.

    Returns:
        An ``ExtractorResult`` whose ``xml_result`` is the new
        ``algorithm`` element.

    Raises:
        RunnableError: If the TEI document contains no title element.
    """
    tei_root = dep_results[interfaces.HeaderTEIExtractor].xml_result
    result_root = ET.Element('algorithm', {
        'name': 'Grobid Header Extraction',
        'version': '0.1'
    })

    # Retrieve title from TEI doc
    title = tei_root.find('./teiHeader//titleStmt/title')
    if title is None:
        raise RunnableError('No title found in TEI document')
    ET.SubElement(result_root, 'title').text = title.text

    # Find document-level affiliations
    affiliations = tei_root.findall(
        './teiHeader//sourceDesc/biblStruct/analytic/affiliation')
    if affiliations:
        affiliation_str = " | ".join(
            map(_get_affiliation_str, affiliations))
        ET.SubElement(result_root, 'affiliation').text = affiliation_str

    # Retrieve author names from TEI doc
    authors = tei_root.findall('./teiHeader//biblStruct//author')
    authors_node = ET.SubElement(result_root, 'authors')
    if authors:
        for author in authors:
            author_node = ET.SubElement(authors_node, 'author')

            # Find and output name-related info (forenames, then surnames)
            name_tags = []
            name_tags.extend(author.findall('./persName/forename'))
            name_tags.extend(author.findall('./persName/surname'))
            # findall() never yields None, but an empty element has
            # text None, which would make str.join raise TypeError.
            name_parts = [
                tag.text for tag in name_tags if tag.text is not None
            ]
            ET.SubElement(author_node, 'name').text = ' '.join(name_parts)

            # Find and output affiliation-related info
            author_affiliations = author.findall('./affiliation')
            if author_affiliations:
                # Use a pipe to delimit separate affiliations
                affiliation_str = " | ".join(
                    map(_get_affiliation_str, author_affiliations))
                ET.SubElement(
                    author_node, 'affiliation').text = affiliation_str
    else:
        self.log('No authors found')

    # Retrieve keywords from TEI doc
    keywords = tei_root.findall('./teiHeader//keywords//item/term')
    keywords_node = ET.SubElement(result_root, 'keywords')
    if keywords:
        for term in keywords:
            ET.SubElement(keywords_node, 'keyword').text = term.text
    else:
        self.log('No keywords found')

    # Try and find an abstract: the first <div type="abstract"> under text
    divs = tei_root.findall('./text//div')
    abstracts = [div for div in divs if div.get('type') == 'abstract']
    if abstracts:
        abstract = abstracts[0]
        # tostring() yields bytes by default on Python 3; decode so the
        # str regex below can be applied.
        xml_string = ET.tostring(abstract).decode('utf-8')
        # Drop the <head>...</head> heading so it is not part of the text
        remove_heading = re.compile(r'\s*<head.*?>.*?<\s*/\s*head>',
                                    re.DOTALL | re.UNICODE)
        xml_string = remove_heading.sub('', xml_string)
        abstract_string = utils.xml_to_plain_text(xml_string)
        ET.SubElement(result_root, 'abstract').text = abstract_string
    else:
        self.log('No abstract found')

    # CSX style xml document of header information
    return ExtractorResult(xml_result=result_root)
def extract(self, data, dep_results):
    """Translate a Grobid TEI header into a CSX-style header XML tree.

    The TEI root is taken from the ``xml_result`` attribute of the
    ``interfaces.HeaderTEIExtractor`` entry in ``dep_results``. Title,
    document-level affiliations, authors (name and affiliations),
    keywords and abstract are copied into a new ``algorithm`` element.

    Args:
        data: Not used by this extractor.
        dep_results: Mapping indexed by dependency class; must hold an
            entry for ``interfaces.HeaderTEIExtractor``.

    Returns:
        An ``ExtractorResult`` whose ``xml_result`` is the new
        ``algorithm`` element.

    Raises:
        RunnableError: If the TEI document contains no title element.
    """
    tei_root = dep_results[interfaces.HeaderTEIExtractor].xml_result
    result_root = ET.Element('algorithm', {'name': 'Grobid Header Extraction',
                                           'version': '0.1'})

    # Retrieve title from TEI doc
    title = tei_root.find('./teiHeader//titleStmt/title')
    if title is not None:
        ET.SubElement(result_root, 'title').text = title.text
    else:
        raise RunnableError('No title found in TEI document')

    # Find document-level affiliations
    affiliations = tei_root.findall(
        './teiHeader//sourceDesc/biblStruct/analytic/affiliation')
    if affiliations:
        affiliation_str = " | ".join(map(_get_affiliation_str, affiliations))
        ET.SubElement(result_root, 'affiliation').text = affiliation_str

    # Retrieve author names from TEI doc
    authors = tei_root.findall('./teiHeader//biblStruct//author')
    authors_node = ET.SubElement(result_root, 'authors')
    if authors:
        for author in authors:
            author_node = ET.SubElement(authors_node, 'author')

            # Find and output name-related info (forenames, then surnames)
            name_tags = []
            name_tags.extend(author.findall('./persName/forename'))
            name_tags.extend(author.findall('./persName/surname'))
            # Elements from findall() are never None; it is their text
            # that may be None (empty element), which str.join rejects.
            name_parts = [tag.text for tag in name_tags
                          if tag.text is not None]
            name = ' '.join(name_parts)
            ET.SubElement(author_node, 'name').text = name

            # Find and output affiliation-related info
            affiliations = author.findall('./affiliation')
            if affiliations:
                # Use a pipe to delimit separate affiliations
                affiliation_str = " | ".join(
                    map(_get_affiliation_str, affiliations))
                ET.SubElement(
                    author_node, 'affiliation').text = affiliation_str
    else:
        self.log('No authors found')

    # Retrieve keywords from TEI doc
    keywords = tei_root.findall('./teiHeader//keywords//item/term')
    keywords_node = ET.SubElement(result_root, 'keywords')
    if keywords:
        for term in keywords:
            ET.SubElement(keywords_node, 'keyword').text = term.text
    else:
        self.log('No keywords found')

    # Try and find an abstract: the first <div type="abstract"> under text
    divs = tei_root.findall('./text//div')
    abstracts = [div for div in divs if div.get('type') == 'abstract']
    if abstracts:
        abstract = abstracts[0]
        # Decode the serialised bytes so the str pattern below applies on
        # Python 3 as well.
        xml_string = ET.tostring(abstract).decode('utf-8')
        # Strip the <head>...</head> heading from the abstract markup
        remove_heading = re.compile(r'\s*<head.*?>.*?<\s*/\s*head>',
                                    re.DOTALL | re.UNICODE)
        xml_string = remove_heading.sub('', xml_string)
        abstract_string = utils.xml_to_plain_text(xml_string)
        ET.SubElement(result_root, 'abstract').text = abstract_string
    else:
        self.log('No abstract found')

    # CSX style xml document of header information
    return ExtractorResult(xml_result=result_root)