def extract(self, data, dependency_results):
    """Run the Algorithms extraction jar on a PDF and return its XML result.

    Writes the PDF bytes to a temp file, invokes the jar with a fresh
    results directory, then parses the single expected ``*.xml`` output.

    Args:
        data: raw PDF bytes.
        dependency_results: unused by this extractor.

    Returns:
        ExtractorResult whose ``xml_result`` is the parsed XML root.

    Raises:
        RunnableError: on timeout, a non-zero exit status, or an
            unexpected number of result files.
    """
    results_dir = tempfile.mkdtemp() + '/'
    # BUG FIX: the results directory was only removed on the timeout and
    # success paths, leaking a temp dir whenever the jar failed or produced
    # unexpected output. A try/finally now guarantees cleanup on every path.
    try:
        temp_pdf_file = extraction.utils.temp_file(data)
        try:
            command_args = [
                'java', '-jar', config.ALGORITHMS_JAR_PATH,
                config.ALGORITHMS_PERL_PATH, 'f', temp_pdf_file, results_dir
            ]
            status, stdout, stderr = extraction.utils.external_process(
                command_args, timeout=20)
        except subprocess.TimeoutExpired:
            raise RunnableError(
                'Algorithms Jar timed out while processing document')
        finally:
            # The input temp file is always removed, even on timeout.
            os.remove(temp_pdf_file)

        if status != 0:
            raise RunnableError('Algorithms Jar Failure. Possible error:\n' + stderr)

        # Exactly one XML results file is expected from the jar.
        paths = glob.glob(results_dir + '*.xml')
        if len(paths) != 1:
            raise RunnableError(
                'Wrong number of results files from Algorithms Jar.')

        tree = safeET.parse(paths[0])
        xml_root = tree.getroot()
        return ExtractorResult(xml_result=xml_root)
    finally:
        shutil.rmtree(results_dir)
def extract(self, data, dependency_results):
    """Run pdffigures on a PDF and collect its PNG and JSON outputs.

    Args:
        data: raw PDF bytes.
        dependency_results: unused by this extractor.

    Returns:
        ExtractorResult with ``xml_result=None`` and a ``files`` dict
        mapping filename suffixes (e.g. ``.Figure-1.png``) to contents.

    Raises:
        RunnableError: on timeout or a non-zero exit status.
    """
    results_dir = tempfile.mkdtemp() + '/'
    # BUG FIX: results_dir previously leaked whenever pdffigures exited
    # non-zero (or reading a result file raised) — only the timeout and
    # success paths cleaned it up. try/finally guarantees removal.
    try:
        temp_pdf_file = extraction.utils.temp_file(data)
        try:
            command_args = [config.PDFFIGURES_PATH, '-o', results_dir,
                            '-j', results_dir, temp_pdf_file]
            status, stdout, stderr = extraction.utils.external_process(
                command_args, timeout=20)
        except subprocess.TimeoutExpired:
            raise RunnableError('PDFFigures timed out while processing document')
        finally:
            os.remove(temp_pdf_file)

        if status != 0:
            raise RunnableError('PDFFigures Failure. Possible error:\n' + stderr)

        files = {}

        # Handle png results
        for path in glob.glob(results_dir + '*.png'):
            # basename looks something like this: -Figure-X.png
            # remove the hyphen and replace with a '.', because framework
            # will add filename prefix later
            filename = '.' + os.path.basename(path)[1:]
            with open(path, 'rb') as f:
                files[filename] = f.read()

        # Handle json results
        for path in glob.glob(results_dir + '*.json'):
            filename = '.' + os.path.basename(path)[1:]
            with open(path, 'r') as f:
                files[filename] = f.read()

        return ExtractorResult(xml_result=None, files=files)
    finally:
        shutil.rmtree(results_dir)
def extract(self, data, dep_results):
    """Extract plain text from a PDF using Apache PDFBox.

    Args:
        data: raw PDF bytes.
        dep_results: unused by this extractor.

    Returns:
        ExtractorResult with ``xml_result=None`` and a ``files`` dict
        containing the UTF-8 plain text under the ``.txt`` key.

    Raises:
        RunnableError: on timeout or a non-zero PDFBox exit status.
    """
    # PDFBox reads from disk, so stage the PDF bytes in a temp file first.
    pdf_path = extraction.utils.temp_file(data, suffix='.pdf')
    try:
        args = ['java', '-jar', config.PDF_BOX_JAR, 'ExtractText',
                '-console', '-encoding', 'UTF-8', pdf_path]
        status, stdout, stderr = extraction.utils.external_process(
            args, timeout=30)
    except subprocess.TimeoutExpired:
        raise RunnableError('PDFBox timed out while processing document')
    finally:
        # Always clean up the staged PDF, even on timeout.
        os.remove(pdf_path)

    if status != 0:
        raise RunnableError(
            'PDFBox returned error status code {0}.\nPossible error:\n{1}'.
            format(status, stderr))

    # stdout is already the finished plain text; no post-processing needed.
    return ExtractorResult(xml_result=None, files={'.txt': stdout})
def extract(self, data, dep_results):
    """POST a document to Grobid's processFulltextDocument endpoint.

    Args:
        data: raw document bytes to upload.
        dep_results: unused by this extractor.

    Returns:
        ExtractorResult whose ``xml_result`` is the parsed TEI XML root.

    Raises:
        RunnableError: if the request fails or Grobid returns non-200.
    """
    url = '{0}/processFulltextDocument'.format(config.GROBID_HOST)
    files = {'input': data}
    vars = {}
    try:
        resp = requests.post(url, files=files, data=vars)
    except requests.exceptions.RequestException as ex:
        raise RunnableError('Request to Grobid server failed')

    if resp.status_code != 200:
        raise RunnableError(
            'Grobid returned status {0} instead of 200\nPossible Error:\n{1}'
            .format(resp.status_code, resp.text))

    xml_text = resp.content

    # remove namespace info from xml string
    # this is hacky but makes parsing it much much nicer down the road
    # BUG FIX: resp.content is bytes on Python 3, so the pattern and the
    # replacement must be bytes as well (a str pattern raises TypeError).
    remove_xmlns = re.compile(br'\sxmlns[^"]+"[^"]+"')
    xml_text = remove_xmlns.sub(b'', xml_text)

    xml = safeET.fromstring(xml_text)

    # grobid returns TEI xml file
    return ExtractorResult(xml_result=xml)
def extract(self, data, deps):
    """Scan the input text for email addresses.

    Args:
        data: text to scan.
        deps: unused by this extractor.

    Returns:
        ExtractorResult whose ``xml_result`` is an <extraction> element
        containing one <email> child per address found.
    """
    # Case-insensitive match of common user@domain.tld address forms.
    pattern = re.compile(r'\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}\b',
                         re.IGNORECASE | re.UNICODE)
    root = ET.Element('extraction')
    for address in pattern.findall(data):
        node = ET.SubElement(root, 'email')
        node.text = address
    return ExtractorResult(xml_result=root)
def extract(self, data, deps):
    """Filter input through awk, keeping lines that start with a digit.

    Args:
        data: text piped to awk on stdin.
        deps: unused by this extractor.

    Returns:
        ExtractorResult whose ``xml_result`` is an <extraction> element
        with one <line> child per non-empty matching output line.

    Raises:
        RunnableError: if awk does not finish within the timeout.
    """
    try:
        status, stdout, stderr = utils.external_process(
            ['awk', '/^[0-9]/ {print;}', '-'], input_data=data, timeout=5)
    except subprocess.TimeoutExpired:
        raise RunnableError('awk timed out')

    root = ET.Element('extraction')
    # Drop empty strings produced by trailing/consecutive newlines.
    for line in filter(None, stdout.split("\n")):
        ET.SubElement(root, 'line').text = line
    return ExtractorResult(xml_result=root)
def extract(self, data, dependency_results):
    """Convert the TEI body from an upstream extractor into plain text.

    Args:
        data: unused; the TEI XML comes from ``dependency_results``.
        dependency_results: must contain a FullTextTEIExtractor result
            whose ``xml_result`` is the TEI document root.

    Returns:
        ExtractorResult with the UTF-8 encoded plain text under ``.txt``.

    Raises:
        RunnableError: if the TEI document has no <body> element.
    """
    xml_root = dependency_results[
        interfaces.FullTextTEIExtractor].xml_result
    body_node = xml_root.find('./text/body')
    if body_node is None:
        # BUG FIX: this was `return RunnableError(...)`, which silently
        # handed the exception object back as the extractor result
        # instead of signalling failure.
        raise RunnableError('Could not find body text in TEI xml file')

    xml_string = ET.tostring(body_node).decode('utf-8')
    plain_text = utils.xml_to_plain_text(xml_string)
    # Framework expects file contents as bytes.
    plain_text = plain_text.encode('utf-8')
    files = {'.txt': plain_text}
    return ExtractorResult(xml_result=None, files=files)
def extract(self, data, dependency_results):
    """Run ParsCit over the upstream plain-text result to find citations.

    Args:
        data: unused; the text comes from the PlainTextExtractor dependency.
        dependency_results: must contain a PlainTextExtractor result with
            a ``.txt`` entry in its files dict.

    Returns:
        ExtractorResult whose ``xml_result`` is ParsCit's parsed XML output.

    Raises:
        RunnableError: on timeout or a non-zero ParsCit exit status.
    """
    # ParsCit wants a file on disk, so stage the upstream plain text.
    pdf_text = dependency_results[
        interfaces.PlainTextExtractor].files['.txt']
    text_file_path = extraction.utils.temp_file(pdf_text)

    try:
        status, stdout, stderr = extraction.utils.external_process(
            ['perl', config.PARSCIT_PATH, text_file_path], timeout=20)
    except subprocess.TimeoutExpired:
        raise RunnableError('ParsCit timed out while processing document')
    finally:
        # Remove the staged text file on every path.
        os.remove(text_file_path)

    if status != 0:
        raise RunnableError('ParsCit Failure. Possible error:\n' + stderr)

    # ParsCit prints an XML document on stdout; parse it into a tree.
    return ExtractorResult(xml_result=safeET.fromstring(stdout))
def extract(self, data, dep_results):
    """Extract plain text with PDFBox, returning a pointer XML plus the file.

    Args:
        data: raw PDF bytes.
        dep_results: unused by this extractor.

    Returns:
        ExtractorResult whose ``xml_result`` is a <file> element naming
        ``plain_text.txt`` and whose files dict holds the text itself.

    Raises:
        RunnableError: on timeout or a non-zero PDFBox exit status.
    """
    pdf_path = utils.temp_file(data, suffix='.pdf')
    try:
        cmd = ['java', '-jar', config.PDF_BOX_JAR, 'ExtractText',
               '-console', '-encoding', 'UTF-8', pdf_path]
        status, stdout, stderr = utils.external_process(cmd, timeout=30)
    except subprocess.TimeoutExpired:
        raise RunnableError('PDFBox timed out while processing document')
    finally:
        os.remove(pdf_path)

    if status != 0:
        raise RunnableError(
            'PDFBox returned error status code {0}.\nPossible error:\n{1}'.format(status, stderr))

    # create xml result file that just points towards the file with plain text results
    pointer = ET.Element('file')
    pointer.text = 'plain_text.txt'
    return ExtractorResult(xml_result=pointer,
                           files={'plain_text.txt': stdout})
def extract(self, data, dep_results):
    """Convert a Grobid header TEI document into CSX-style header XML.

    Pulls title, affiliations, authors, keywords, and abstract out of the
    TEI tree produced by the HeaderTEIExtractor dependency.

    Args:
        data: unused; the TEI XML comes from ``dep_results``.
        dep_results: must contain a HeaderTEIExtractor result whose
            ``xml_result`` is the TEI document root.

    Returns:
        ExtractorResult whose ``xml_result`` is an <algorithm> element
        containing the extracted header fields.

    Raises:
        RunnableError: if the TEI document has no title.
    """
    tei_root = dep_results[interfaces.HeaderTEIExtractor].xml_result
    result_root = ET.Element('algorithm', {
        'name': 'Grobid Header Extraction',
        'version': '0.1'
    })

    # Retrieve title from TEI doc
    title = tei_root.find('./teiHeader//titleStmt/title')
    if title is not None:
        ET.SubElement(result_root, 'title').text = title.text
    else:
        raise RunnableError('No title found in TEI document')

    # Find document-level affiliations
    affiliations = tei_root.findall(
        './teiHeader//sourceDesc/biblStruct/analytic/affiliation')
    if affiliations:
        affiliation_str = " | ".join(
            map(_get_affiliation_str, affiliations))
        ET.SubElement(result_root, 'affiliation').text = affiliation_str

    # Retrieve author names from TEI doc
    authors = tei_root.findall('./teiHeader//biblStruct//author')
    authors_node = ET.SubElement(result_root, 'authors')
    if authors is not None and len(authors):
        for author in authors:
            author_node = ET.SubElement(authors_node, 'author')

            # Find and output name-related info
            name_tags = []
            name_tags.extend(author.findall("./persName/forename"))
            name_tags.extend(author.findall('./persName/surname'))
            # BUG FIX: the old filter `if name is not None` never removed
            # anything (findall never yields None); an empty tag's `.text`
            # of None would crash ' '.join. Filter on the text instead.
            name_parts = [
                name.text for name in name_tags if name.text is not None
            ]
            name = ' '.join(name_parts)
            ET.SubElement(author_node, 'name').text = name

            # Find and output affilliation-related info
            affiliations = author.findall('./affiliation')
            if affiliations:
                # Use a pipe to delimit seperate affiliations
                affiliation_str = " | ".join(
                    map(_get_affiliation_str, affiliations))
                ET.SubElement(author_node,
                              'affiliation').text = affiliation_str
    else:
        self.log('No authors found')

    # Retrieve keywords from TEI doc
    keywords = tei_root.findall('./teiHeader//keywords//item/term')
    keywords_node = ET.SubElement(result_root, 'keywords')
    if keywords is not None and len(keywords):
        for term in keywords:
            ET.SubElement(keywords_node, 'keyword').text = term.text
    else:
        self.log('No keywords found')

    # Try and find an abstract
    divs = tei_root.findall('./text//div')
    abstracts = [div for div in divs if div.get('type') == 'abstract']
    if abstracts:
        abstract = abstracts[0]
        # BUG FIX: ET.tostring returns bytes on Python 3, which would break
        # the str-pattern regex below; encoding='unicode' yields text.
        xml_string = ET.tostring(abstract, encoding='unicode')
        remove_heading = re.compile(r'\s*<head.*?>.*?<\s*/\s*head>',
                                    re.DOTALL | re.UNICODE)
        xml_string = remove_heading.sub('', xml_string)
        abstract_string = utils.xml_to_plain_text(xml_string)
        ET.SubElement(result_root, 'abstract').text = abstract_string
    else:
        self.log('No abstract found')

    # CSX style xml document of header information
    return ExtractorResult(xml_result=result_root)
def extract(self, data, dep_results):
    """Extract citation references via Grobid's processReferences service.

    Args:
        data: raw document bytes forwarded to Grobid.
        dep_results: unused by this extractor.

    Returns:
        ExtractorResult wrapping the TEI XML returned by Grobid.
    """
    return ExtractorResult(
        xml_result=_call_grobid_method(data, 'processReferences'))
def extract(self, data, dep_results):
    """Extract header metadata via Grobid's processHeaderDocument service.

    Args:
        data: raw document bytes forwarded to Grobid.
        dep_results: unused by this extractor.

    Returns:
        ExtractorResult wrapping the TEI XML returned by Grobid.
    """
    return ExtractorResult(
        xml_result=_call_grobid_method(data, 'processHeaderDocument'))
def extract(self, data, dep_results):
    """Stub extractor emitting a fixed test file plus a pointer element.

    Args:
        data: unused.
        dep_results: unused.

    Returns:
        ExtractorResult whose XML names 'test.txt' and whose files dict
        carries its fixed contents.
    """
    pointer = ET.Element('file')
    pointer.text = 'test.txt'
    return ExtractorResult(pointer, files={'test.txt': 'test test'})
def extract(self, data, dep_results):
    """Echo extractor: wraps the raw input in a <result> element.

    Args:
        data: text placed verbatim as the element's text.
        dep_results: unused.

    Returns:
        ExtractorResult wrapping the <result> element.
    """
    result = ET.Element('result')
    result.text = data
    return ExtractorResult(result)
def extract(self, data, dep_results):
    """No-op extractor: produces an empty result with no XML payload.

    Args:
        data: unused.
        dep_results: unused.

    Returns:
        ExtractorResult with ``xml_result`` set to None.
    """
    empty = ExtractorResult(xml_result=None)
    return empty