def get_asbmb_full_text(url):
    """Try to retrieve the full text page of an ASBMB article.

    The page at `url` is downloaded and parsed, the
    ``citation_fulltext_html_url`` meta tag is looked up, and the page it
    points to is downloaded and parsed as well. Nothing is extracted from
    the full text page yet, so every code path returns ``(None, None)``.

    Parameters
    ----------
    url : str
        The URL of the article landing page.

    Returns
    -------
    tuple
        Currently always ``(None, None)``.
    """
    # Get the location of the full text page from the target URL
    req = requests.get(url)
    if req.status_code != 200:
        logger.warning(
            'ASBMB full text query returned status code %s: URL %s' %
            (req.status_code, url))
        return (None, None)
    # If we're here that means that we successfully got the paper URL
    xml_str = req.text
    tree = ET.XML(xml_str, parser=UTB())
    fulltext_elem = tree.find('.//{http://www.w3.org/1999/xhtml}meta'
                              '[@name="citation_fulltext_html_url"]')
    # Couldn't find the element containing the full text URL
    if fulltext_elem is None:
        logger.warning("ASBMB full text: couldn't find the full text URL "
                       "element among the meta tags.")
        return (None, None)
    fulltext_url = fulltext_elem.attrib['content']
    # Now, get the full text HTML page
    req2 = requests.get(fulltext_url)
    if req2.status_code != 200:
        # Report the status of the request that actually failed (req2);
        # req is known to have returned 200 at this point.
        logger.warning(
            'ASBMB full text query returned status code %s: URL %s' %
            (req2.status_code, fulltext_url))
        return (None, None)
    # We've got the full text page! It is parsed here, but the resulting
    # tree is not used yet.
    xml_str2 = req2.text
    tree2 = ET.XML(xml_str2, parser=UTB())
    return None, None
def get_dois(query_str, count=100):
    """Search ScienceDirect through the API and return the DOIs found.

    See http://api.elsevier.com/content/search/fields/scidir for how to
    construct the query string.

    Example: 'abstract(BRAF) AND all("colorectal cancer")'

    Parameters
    ----------
    query_str : str
        The query string to search with.
    count : int
        Value sent as the ``count`` request parameter. Default: 100

    Returns
    -------
    list or None
        The text of each ``atom:entry/prism:doi`` element in the response,
        or None if no API key is available or the response status is not
        200.
    """
    if elsevier_keys is None:
        logger.error('Missing API key at %s, could not perform search.' %
                     api_key_file)
        return None
    search_url = '%s/%s' % (elsevier_search_url, query_str)
    request_params = {
        'query': query_str,
        'count': count,
        'httpAccept': 'application/xml',
        'sort': '-coverdate',
        'field': 'doi',
    }
    response = requests.get(search_url, request_params)
    if response.status_code != 200:
        return None
    root = ET.XML(response.content, parser=UTB())
    return [tag.text
            for tag in root.findall('atom:entry/prism:doi', elsevier_ns)]
def __init__(self, xml_string):
    """Parse an EKB XML string and extract statements from it.

    If the string cannot be parsed, an error is logged, ``self.tree`` is
    set to None and initialization stops there; only ``statements`` and
    ``tree`` are set in that case.
    """
    self.statements = []
    try:
        root = ET.XML(xml_string, parser=UTB())
    except ET.ParseError:
        logger.error('Could not parse XML string')
        self.tree = None
        return
    self.tree = root
    # The document ID is the 'id' attribute of the root (EKB) tag
    self.doc_id = root.attrib.get('id')

    # Index paragraph text and paragraph section type by paragraph ID
    par_text = {}
    par_section = {}
    for par in root.findall('input/paragraphs/paragraph'):
        par_id = par.attrib['id']
        par_text[par_id] = par.text
        par_section[par_id] = par.attrib.get('sec-type')
    self.paragraphs = par_text
    # Index sentence text by sentence ID
    self.sentences = {
        sent.attrib['id']: sent.text
        for sent in root.findall('input/sentences/sentence')
    }
    self.par_to_sec = par_section

    # Extract statements
    for relation_type in ('CC', 'EVENT'):
        self.extract_noun_relations(relation_type)
def get_xml(pmc_id):
    """Return the XML of the article with the given PMC ID.

    Parameters
    ----------
    pmc_id : str
        The PMC ID, with or without a leading 'PMC' (case-insensitive).

    Returns
    -------
    str or None
        The response body decoded as UTF-8, or None if the response status
        is not 200 or the response contains an OAI error element.
    """
    if pmc_id.upper().startswith('PMC'):
        pmc_id = pmc_id[3:]
    # Request params
    params = {
        'verb': 'GetRecord',
        'identifier': 'oai:pubmedcentral.nih.gov:%s' % pmc_id,
        'metadataPrefix': 'pmc',
    }
    # Submit the request
    res = requests.get(pmc_url, params)
    if res.status_code != 200:
        logger.warning("Couldn't download %s" % pmc_id)
        return None
    # Parse the raw bytes to check for an error element in the response
    xml_bytes = res.content
    root = ET.XML(xml_bytes, parser=UTB())
    oai_ns = "http://www.openarchives.org/OAI/2.0/"
    err_tag = root.find('{%s}error' % oai_ns)
    if err_tag is None:
        # No error: return the XML as a unicode string
        return xml_bytes.decode('utf-8')
    err_code = err_tag.attrib['code']
    err_text = err_tag.text
    logger.warning('PMC client returned with error %s: %s' %
                   (err_code, err_text))
    return None
def __init__(self, xml_string):
    """Parse an EKB XML string and extract statements from it.

    If the string cannot be parsed, an error is logged, ``self.tree`` is
    set to None and initialization stops there; only ``statements`` and
    ``tree`` are set in that case.
    """
    self.statements = []
    try:
        self.tree = ET.XML(xml_string, parser=UTB())
    except ET.ParseError:
        logger.error('Could not parse XML string')
        self.tree = None
        return

    root = self.tree
    # The document ID is the 'id' attribute of the root (EKB) tag
    self.doc_id = root.attrib.get('id')

    # Index paragraphs and sentences by their 'id' attribute
    par_tags = root.findall('input/paragraphs/paragraph')
    sent_tags = root.findall('input/sentences/sentence')
    self.paragraphs = {tag.attrib['id']: tag.text for tag in par_tags}
    self.sentences = {tag.attrib['id']: tag.text for tag in sent_tags}
    self.par_to_sec = {tag.attrib['id']: tag.attrib.get('sec-type')
                       for tag in par_tags}

    # Unhandled events are collected for development purposes
    self._unhandled_events = []

    # Extract statements
    for relation_type in ('CC', 'EVENT'):
        self.extract_noun_relations(relation_type)

    # Some EKBs yield two redundant relations over the same arguments;
    # these are eliminated here
    self._remove_multi_extraction_artifacts()

    # Report the distinct unhandled event types in sorted order
    unhandled_types = sorted(set(self._unhandled_events))
    logger.debug('Unhandled event types: %s' % (', '.join(unhandled_types)))
def __init__(self, xml_string):
    """Parse an EKB XML string and preprocess its events.

    If the string cannot be parsed, an error is logged, ``self.tree`` is
    set to None and initialization stops there; only ``statements`` and
    ``tree`` are set in that case.
    """
    self.statements = []
    try:
        parsed = ET.XML(xml_string, parser=UTB())
    except ET.ParseError:
        logger.error('Could not parse XML string')
        self.tree = None
        return
    self.tree = parsed

    # The document ID is the 'id' attribute of the root (EKB) tag
    self.doc_id = parsed.attrib.get('id')

    # Index paragraph text and section type by paragraph ID
    id_to_text = {}
    id_to_section = {}
    for paragraph in parsed.findall('input/paragraphs/paragraph'):
        key = paragraph.attrib['id']
        id_to_text[key] = paragraph.text
        id_to_section[key] = paragraph.attrib.get('sec-type')
    self.paragraphs = id_to_text

    # Index sentence text by sentence ID
    sentence_index = {}
    for sentence in parsed.findall('input/sentences/sentence'):
        sentence_index[sentence.attrib['id']] = sentence.text
    self.sentences = sentence_index
    self.par_to_sec = id_to_section

    # Events that are part of relations, and events subsumed by other
    # events
    self.relation_events = set()
    self.subsumed_events = set()
    # Unhandled events are collected for development purposes
    self._unhandled_events = set()
    self._preprocess_events()
def process_xml(xml_str):
    """Parse an XML string and process the resulting element tree.

    Returns the result of ``_process_elementtree`` on the parsed tree, or
    None (after logging an error) if the string cannot be parsed.
    """
    try:
        root = ET.XML(xml_str, parser=UTB())
    except ET.ParseError:
        logger.error('Could not parse XML string')
        return None
    return _process_elementtree(root)
def get_abstract(doi):
    """Get the abstract text of an article from Elsevier given a doi.

    Parameters
    ----------
    doi : str
        The DOI passed on to ``download_article``.

    Returns
    -------
    str or None
        The text of the ``dc:description`` element found under
        ``article:coredata``, or None if the article could not be
        downloaded or either element is missing from the XML.
    """
    xml_string = download_article(doi)
    if xml_string is None:
        return None
    assert isinstance(xml_string, str)
    xml_tree = ET.XML(xml_string.encode('utf-8'), parser=UTB())
    if xml_tree is None:
        return None
    # find() returns None when the element is absent; compare with
    # "is None" explicitly because an Element without children is falsy.
    coredata = xml_tree.find('article:coredata', elsevier_ns)
    if coredata is None:
        return None
    abstract = coredata.find('dc:description', elsevier_ns)
    if abstract is None:
        return None
    return abstract.text
def extract_paragraphs(xml_string):
    """Get paragraphs from the body of the given Elsevier xml.

    Returns the result of ``_get_article_body`` if it is non-empty,
    otherwise a one-element list holding the result of ``_get_raw_text``
    if that is non-empty, otherwise None. None is also returned when the
    ``article:originalText`` element is missing.
    """
    assert isinstance(xml_string, str)
    root = ET.XML(xml_string.encode('utf-8'), parser=UTB())
    full_text = root.find('article:originalText', elsevier_ns)
    if full_text is None:
        logger.info('Could not find full text element article:originalText')
        return None
    paragraphs = _get_article_body(full_text)
    if not paragraphs:
        # Fall back to the raw text, wrapped in a list
        raw_text = _get_raw_text(full_text)
        paragraphs = [raw_text] if raw_text else None
    return paragraphs
def send_request(url, data):
    """Send a GET request and return the response parsed as XML.

    Returns None if the request raises a ``requests`` exception (which is
    logged) or if the response status is not 200.
    """
    try:
        res = requests.get(url, params=data)
    except requests.exceptions.RequestException as err:
        # Timeout is a subclass of RequestException; it only gets a
        # different headline message
        if isinstance(err, requests.exceptions.Timeout):
            logger.error('PubMed request timed out')
        else:
            logger.error('PubMed request exception')
        logger.error('url: %s, data: %s' % (url, data))
        logger.error(err)
        return None
    if res.status_code != 200:
        return None
    return ET.XML(res.content, parser=UTB())
def get_hgnc_entry(hgnc_id):
    """Return the HGNC entry for the given HGNC ID from the web service.

    Parameters
    ----------
    hgnc_id : str
        The HGNC ID to look up.

    Returns
    -------
    xml_tree : ElementTree
        The parsed XML of the response for the given HGNC ID, or None if
        the response status is not 200.
    """
    entry_url = hgnc_url + 'hgnc_id/%s' % hgnc_id
    response = requests.get(entry_url, headers={'Accept': '*/*'})
    if response.status_code != 200:
        return None
    return ET.XML(response.content, parser=UTB())
def extract_text(xml_string):
    """Get the text content of the given Elsevier xml.

    Returns the result of ``_get_article_body`` if it is non-empty,
    otherwise the result of ``_get_raw_text`` if that is non-empty,
    otherwise None. None is also returned when `xml_string` is None or the
    ``article:originalText`` element is missing.
    """
    if xml_string is None:
        return None
    assert isinstance(xml_string, str)
    # Build XML ElementTree
    root = ET.XML(xml_string.encode('utf-8'), parser=UTB())
    # Look for full text element
    full_text = root.find('article:originalText', elsevier_ns)
    if full_text is None:
        logger.info('Could not find full text element article:originalText')
        return None
    # Prefer the structured article body, then fall back to the raw text;
    # no PDF attachment fallback is attempted
    body = _get_article_body(full_text)
    if body:
        return body
    raw = _get_raw_text(full_text)
    return raw if raw else None
def process_xml(xml_str):
    """Return processor with Statements extracted from a Sparser XML.

    Parameters
    ----------
    xml_str : str
        The XML string obtained by reading content with Sparser, using the
        'xml' output mode.

    Returns
    -------
    sp : SparserXMLProcessor
        A SparserXMLProcessor which has extracted Statements as its
        statements attribute. None is returned if the XML string cannot be
        parsed.
    """
    try:
        root = ET.XML(xml_str, parser=UTB())
    except ET.ParseError as err:
        logger.error('Could not parse XML string')
        logger.error(err)
        return None
    return _process_elementtree(root)
def test_unicode_tree_builder():
    """Parse a small document with UTB and check the found element."""
    xml_bytes = u'<html><bar>asdf</bar></html>'.encode('utf-8')
    tree = ET.parse(BytesIO(xml_bytes), parser=UTB())
    bar_elem = tree.find('.//bar')
    assert unicode_strs(bar_elem)
def send_request(url, data):
    """Send a GET request and return the response parsed as XML.

    Returns None if the response status is not 200.
    """
    response = requests.get(url, params=data)
    if response.status_code != 200:
        return None
    return ET.XML(response.content, parser=UTB())
def get_xml_file(self, xml_file):
    """Get the content of an xml file, parsed with ``ET.XML``.

    The file content is obtained through ``self.get_uncompressed_bytes``
    with ``force_str=False``.
    """
    logger.info("Downloading %s" % xml_file)
    raw_xml = self.get_uncompressed_bytes(xml_file, force_str=False)
    logger.info("Parsing XML metadata")
    parsed = ET.XML(raw_xml, parser=UTB())
    return parsed