def process_paper(model_name, pmid):
    """Read a single paper with REACH, caching the JSON output.

    Parameters
    ----------
    model_name : str
        The directory for the INDRA machine; REACH JSON output is cached
        under ``<model_name>/jsons``.
    pmid : str
        The PMID to process.

    Returns
    -------
    rp : ReachProcessor or None
        Processor holding the extracted INDRA Statements in
        ``rp.statements``, or None if the text could not be obtained.
    txt_format : str or None
        Format of the text that was read ('existing_json', 'pmc_oa_xml',
        'elsevier_xml', 'abstract'), or None if lookup failed.
    """
    json_path = os.path.join(model_name, 'jsons', 'PMID%s.json' % pmid)
    if pmid.startswith('api') or pmid.startswith('PMID'):
        logger.warning('Invalid PMID: %s' % pmid)
    # If the paper has been read, use the json output file
    if os.path.exists(json_path):
        rp = reach.process_json_file(json_path, citation=pmid)
        txt_format = 'existing_json'
    # If the paper has not been read, download the text and read
    else:
        try:
            txt, txt_format = get_full_text(pmid, 'pmid')
        # Bug fix: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit; narrow to Exception
        except Exception:
            return None, None
        if txt_format == 'pmc_oa_xml':
            rp = reach.process_nxml_str(txt, citation=pmid, offline=True)
        elif txt_format == 'elsevier_xml':
            # Extract the raw text from the Elsevier XML
            txt = elsevier_client.extract_text(txt)
            rp = reach.process_text(txt, citation=pmid, offline=True)
        elif txt_format == 'abstract':
            rp = reach.process_text(txt, citation=pmid, offline=True)
        else:
            rp = None
        # REACH drops its output in the working directory; cache it under
        # the model's jsons directory for reuse on subsequent runs.
        # (Consolidates the identical move logic previously repeated in
        # each branch above.)
        if rp is not None and os.path.exists('reach_output.json'):
            shutil.move('reach_output.json', json_path)
    if rp is not None:
        check_pmids(rp.statements)
    return rp, txt_format
def test_get_converted_article_body():
    """Fulltext extraction succeeds when ja:converted-article is the
    document's principal sub-element."""
    # PMID: 11851341
    article_xml = ec.download_article('10.1006/jmbi.2001.5334')
    extracted_body = ec.extract_text(article_xml)
    assert extracted_body
def test_get_rawtext():
    """Content stored under xocs:rawtext can be extracted."""
    # PMID: 20072652
    downloaded = ec.download_article('10.1593/neo.91196')
    text = ec.extract_text(downloaded)
    assert text
def test_get_rawtext():
    """Extraction succeeds for an article whose content lives in
    xocs:rawtext."""
    # PMID: 20072652
    doi = '10.1593/neo.91196'
    assert ec.extract_text(ec.download_article(doi))
def test_get_converted_article_body():
    """Check fulltext retrieval for an article whose principal
    sub-element is ja:converted-article."""
    # PMID: 11851341
    doi = '10.1006/jmbi.2001.5334'
    assert ec.extract_text(ec.download_article(doi))
def get_upload_content(pmid, force_fulltext_lookup=False):
    """Get full text and/or abstract for paper and upload to S3.

    Parameters
    ----------
    pmid : str
        The PMID to get content for; a leading 'PMID' prefix is stripped.
    force_fulltext_lookup : bool
        If True, attempt a full-text lookup even when an abstract is
        already cached on S3.

    Returns
    -------
    tuple
        (content, content_type) where content_type is one of
        'pmc_oa_xml', 'elsevier_xml', 'abstract', or None; (None, None)
        means nothing could be retrieved.
    """
    # Make sure that the PMID doesn't start with PMID so that it doesn't
    # screw up the literature clients
    if pmid.startswith('PMID'):
        pmid = pmid[4:]
    # First, check S3:
    (ft_content_s3, ft_content_type_s3) = get_full_text(pmid)
    # The abstract is on S3 but there is no full text; if we're not forcing
    # fulltext lookup, then we're done
    if ft_content_type_s3 == 'abstract' and not force_fulltext_lookup:
        return (ft_content_s3, ft_content_type_s3)
    # If there's nothing (even an abstract on S3), or if there's an abstract
    # and we're forcing fulltext lookup, do the lookup.
    # The third clause re-fetches Elsevier XML whose full-text element is
    # missing (extract_text came back empty).
    elif ft_content_type_s3 is None or \
            (ft_content_type_s3 == 'abstract' and force_fulltext_lookup) or \
            (ft_content_type_s3 == 'elsevier_xml' and
             not elsevier_client.extract_text(ft_content_s3)):
        # FIXME FIXME FIXME
        if ft_content_type_s3 == 'elsevier_xml':
            logger.info('elsevier_xml for %s missing full text element, '
                        'getting again.' % pmid)
        # FIXME FIXME FIXME
        # Try to retrieve from literature client
        logger.info("PMID%s: getting content using literature client" % pmid)
        (ft_content, ft_content_type) = lit.get_full_text(pmid, 'pmid')
        assert ft_content_type in \
            ('pmc_oa_xml', 'elsevier_xml', 'abstract', None)
        # If we tried to get the full text and didn't even get the abstract,
        # then there was probably a problem with the web service or the DOI
        if ft_content_type is None:
            return (None, None)
        # If we got the abstract, and we already had the abstract on S3, then
        # do nothing
        elif ft_content_type == 'abstract' and \
                ft_content_type_s3 == 'abstract':
            logger.info("PMID%s: found abstract but already had it on "
                        "S3; skipping" % pmid)
            return (ft_content, ft_content_type)
        # If we got the abstract, and we had nothing on S3, then upload
        elif ft_content_type == 'abstract' and ft_content_type_s3 is None:
            logger.info("PMID%s: found abstract, uploading to S3" % pmid)
            put_abstract(pmid, ft_content)
            return (ft_content, ft_content_type)
        # We got a full text (or something other than None or abstract...)
        else:
            logger.info("PMID%s: uploading %s" % (pmid, ft_content_type))
            put_full_text(pmid, ft_content, full_text_type=ft_content_type)
            return (ft_content, ft_content_type)
    # Some form of full text is already on S3
    else:
        # TODO
        # In future, could check for abstract even if full text is found, and
        # upload it just to have it
        return (ft_content_s3, ft_content_type_s3)
    # We should always return before we get here
    assert False
def process_paper(model_name, pmid):
    """Process a paper with the given pubmed identifier

    Parameters
    ----------
    model_name : str
        The directory for the INDRA machine
    pmid : str
        The PMID to process.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    txt_format : str
        A string representing the format of the text
    """
    json_directory = os.path.join(model_name, 'jsons')
    # Bug fix: REACH is told to write its output into json_directory, but
    # the directory was never created; make sure it exists first
    if not os.path.exists(json_directory):
        os.makedirs(json_directory)
    json_path = os.path.join(json_directory, 'PMID%s.json' % pmid)
    if pmid.startswith('api') or pmid.startswith('PMID'):
        logger.warning('Invalid PMID: %s' % pmid)
    # If the paper has been read, use the json output file
    if os.path.exists(json_path):
        rp = reach.process_json_file(json_path, citation=pmid)
        txt_format = 'existing_json'
    # If the paper has not been read, download the text and read
    else:
        try:
            txt, txt_format = get_full_text(pmid, 'pmid')
        except Exception:
            return None, None
        if txt_format == 'pmc_oa_xml':
            rp = reach.process_nxml_str(txt, citation=pmid, offline=True,
                                        output_fname=json_path)
        elif txt_format == 'elsevier_xml':
            # Extract the raw text from the Elsevier XML
            txt = elsevier_client.extract_text(txt)
            rp = reach.process_text(txt, citation=pmid, offline=True,
                                    output_fname=json_path)
        elif txt_format == 'abstract':
            rp = reach.process_text(txt, citation=pmid, offline=True,
                                    output_fname=json_path)
        else:
            rp = None
    if rp is not None:
        check_pmids(rp.statements)
    return rp, txt_format
def test_get_rawtext():
    """Make sure content in xocs:rawtext is extractable; log the XML on
    failure to aid debugging."""
    # PMID: 20072652
    raw_xml = ec.download_article('10.1593/neo.91196')
    text = ec.extract_text(raw_xml)
    if not text:
        logger.warning('Unable to extract text from XML string:\n'
                       '%s...' % raw_xml[:2000])
    assert text
def test_get_rawtext():
    """Extract content stored under xocs:rawtext, logging a snippet of
    the XML if nothing comes back."""
    # PMID: 20072652
    doi = '10.1593/neo.91196'
    xml_payload = ec.download_article(doi)
    body_text = ec.extract_text(xml_payload)
    if not body_text:
        logger.warning('Unable to extract text from XML string:\n'
                       '%s...' % xml_payload[:2000])
    assert body_text
def test_get_converted_article_body():
    """Fulltext of a ja:converted-article document can be extracted; on
    failure, log the beginning of the XML."""
    # PMID: 11851341
    article_xml = ec.download_article('10.1006/jmbi.2001.5334')
    fulltext = ec.extract_text(article_xml)
    if not fulltext:
        logger.warning('Unable to extract text from XML string:\n'
                       '%s...' % article_xml[:2000])
    assert fulltext
def test_get_converted_article_body():
    """An article whose principal sub-element is ja:converted-article
    should yield a non-empty fulltext; log the XML head otherwise."""
    # PMID: 11851341
    target_doi = '10.1006/jmbi.2001.5334'
    xml_data = ec.download_article(target_doi)
    extracted = ec.extract_text(xml_data)
    if not extracted:
        logger.warning('Unable to extract text from XML string:\n'
                       '%s...' % xml_data[:2000])
    assert extracted
def get_text():
    """Fetch content for ``pmid`` from S3 and stage it in ``input_dir``.

    Relies on the enclosing scope for ``pmid``, ``input_dir`` and
    ``force_fulltext``. Returns a dict mapping the PMID to its content
    source label and the path of the file written (None when nothing
    usable was found).
    """
    # Add timeout here for PubMed
    time.sleep(0.5)
    # full_pmid = s3_client.check_pmid(pmid)
    # Look for the full text
    content, content_type = s3_client.get_upload_content(
        pmid, force_fulltext_lookup=force_fulltext)
    content_path = None
    # File extension for each content type that can be written as-is
    direct_extensions = {'pmc_oa_xml': 'nxml',
                         'pmc_auth_xml': 'nxml',
                         'pmc_oa_txt': 'txt',
                         'txt': 'txt',
                         'abstract': 'txt'}
    if content_type is None or content is None:
        # No content found on S3, skipping
        content_source = 'content_not_found'
    elif content_type == 'elsevier_xml':
        # Elsevier content needs its raw text stripped out first
        content = elsevier_client.extract_text(content)
        if content is None:
            # Couldn't get text from Elsevier XML
            content_source = 'elsevier_extract_text_failure'
        else:
            content_source = 'elsevier_xml'
            content_path = os.path.join(input_dir, '%s.txt' % pmid)
    elif content_type in direct_extensions:
        content_source = content_type
        content_path = os.path.join(
            input_dir, '%s.%s' % (pmid, direct_extensions[content_type]))
    else:
        # Unhandled content type, skipping
        content_source = 'unhandled_content_type_%s' % content_type
    # If we got content, write the content to a file with the appropriate
    # extension
    if content_path:
        with open(content_path, 'wb') as f:
            # The XML string is Unicode
            f.write(content.encode('utf-8'))
    # Return dict of results for this PMID
    return {pmid: {'content_source': content_source,
                   'content_path': content_path}}
def process_paper(model_name, pmid):
    """Process a paper with the given pubmed identifier

    Parameters
    ----------
    model_name : str
        The directory for the INDRA machine
    pmid : str
        The PMID to process.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements.
    txt_format : str
        A string representing the format of the text
    """
    json_path = os.path.join(model_name, 'jsons', 'PMID%s.json' % pmid)
    if pmid.startswith('api') or pmid.startswith('PMID'):
        logger.warning('Invalid PMID: %s' % pmid)
    if os.path.exists(json_path):
        # A cached REACH output already exists: reuse it instead of
        # downloading and re-reading the paper
        rp = reach.process_json_file(json_path, citation=pmid)
        txt_format = 'existing_json'
    else:
        # No cached output; fetch the text and run the reader
        try:
            txt, txt_format = get_full_text(pmid, 'pmid')
        except Exception:
            return None, None
        if txt_format == 'elsevier_xml':
            # REACH needs plain text, so strip the Elsevier XML markup
            txt = elsevier_client.extract_text(txt)
        if txt_format == 'pmc_oa_xml':
            rp = reach.process_nxml_str(txt, citation=pmid, offline=True,
                                        output_fname=json_path)
        elif txt_format in ('elsevier_xml', 'abstract'):
            rp = reach.process_text(txt, citation=pmid, offline=True,
                                    output_fname=json_path)
        else:
            rp = None
    if rp is not None:
        check_pmids(rp.statements)
    return rp, txt_format
def on_read(b):
    """Widget callback: read the currently selected paper with Eidos and
    print the extracted Statements.

    Reads the paper index from ``paper_id.value``, extracts its text from
    the cached Elsevier XML in ``articles``, and stores results in the
    global ``statements`` list.
    """
    global articles
    global statements
    raw_txt = elsevier_client.extract_text(articles[int(paper_id.value)])
    # Bug fix: previously this fell through after reporting an
    # inaccessible paper and ran Eidos on the error-page text; it also
    # crashed on the `in` test when extract_text returned None
    if not raw_txt or 'Internal Server Error' in raw_txt:
        print('Sorry, that paper was not accessible for reading.')
        statements = []
        return
    ep = eidos.process_text(raw_txt, webservice='http://localhost:5000')
    statements = ep.statements
    print('We extracted %d statements:' % len(statements))
    for stmt in statements:
        # Last path segment of the UN ontology grounding for each side
        sg = stmt.subj.db_refs['UN'][0][0].split('/')[-1]
        og = stmt.obj.db_refs['UN'][0][0].split('/')[-1]
        printmd('* **%s**(%s) %s **%s**(%s)' %
                (sg, stmt.subj.name,
                 '->' if stmt.overall_polarity() == 1 else '-|',
                 og, stmt.obj.name))
def read_piis(piis):
    """Return texts extracted from articles with given PIIs.

    Parameters
    ----------
    piis : list[str]
        A list of PIIs to extract texts from.

    Returns
    -------
    texts : dict
        A dictionary representing PIIs as keys and extracted texts as
        values.
    """
    texts = {}
    for pii in piis:
        try:
            xml = elsevier_client.download_article(pii, id_type='pii')
            # If we got an empty xml or bad response
            if not xml:
                logger.info('Could not get article content for %s' % pii)
                continue
        # Handle Connection and other errors
        except Exception as e:
            logger.info('Could not get article content for %s because of %s'
                        % (pii, e))
            continue
        try:
            txt = elsevier_client.extract_text(xml)
            # If we could find relevant xml parts
            if not txt:
                logger.info('Could not extract article text for %s' % pii)
                continue
        # Handle Connection and other errors
        except Exception as e:
            logger.info('Could not extract article text for %s because of %s'
                        % (pii, e))
            # Bug fix: without this continue, a failed extraction fell
            # through and stored a stale or undefined `txt` for this PII
            continue
        texts[pii] = txt
    logger.info('Got text back for %d articles.' % len(texts))
    return texts
print('Got %d PIIs' % len(all_piis))

# Download all the XML content, caching each article on disk
for pii in all_piis:
    fname = 'xml/%s.xml' % pii.replace('/', '_')
    if not os.path.exists(fname):
        # Typo fix: was 'Donwloading'
        print('Downloading %s' % pii)
        res = elsevier_client.download_article(pii, 'pii')
        # Bug fix: a failed download returns None, which previously
        # crashed on res.encode; skip it with a message instead
        if res:
            with open(fname, 'wb') as fh:
                fh.write(res.encode('utf-8'))
        else:
            print('Failed to download %s' % pii)
    else:
        print('Cached %s' % pii)

# Strip out the text from all the XML content
for pii in all_piis:
    fname = 'xml/%s.xml' % pii.replace('/', '_')
    with open(fname, 'rb') as fh:
        xml_content = fh.read().decode('utf-8')
    txt = elsevier_client.extract_text(xml_content)
    if not txt:
        continue
    txt_fname = 'txt/%s.txt' % pii.replace('/', '_')
    with open(txt_fname, 'wb') as fh:
        fh.write(txt.encode('utf-8'))

# Now run Eidos on the documents
in_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'txt')
out_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'jsonld')
eidos_cli.extract_from_directory(in_folder, out_folder)
# NOTE(review): fragment of a larger per-PMID loop; the opening `if`
# branch, the num_* counter initializations, text_sources/full_pmid, and
# any trailing branches live outside this view — indentation below is
# reconstructed. Each branch tallies the content source and stages the
# file path to write for the reader.
            logger.info('No content found on S3 for %s, skipping' % pmid)
            continue
        elif content_type == 'pmc_oa_xml':
            num_pmc_oa_xml += 1
            text_sources[full_pmid] = 'pmc_oa_xml'
            content_path = os.path.join(input_dir, '%s.nxml' % pmid)
        elif content_type == 'pmc_auth_xml':
            num_pmc_auth_xml += 1
            text_sources[full_pmid] = 'pmc_auth_xml'
            content_path = os.path.join(input_dir, '%s.nxml' % pmid)
        elif content_type == 'pmc_oa_txt':
            num_txt += 1
            text_sources[full_pmid] = 'pmc_oa_txt'
            content_path = os.path.join(input_dir, '%s.txt' % pmid)
        elif content_type == 'elsevier_xml':
            # Elsevier content is raw XML; strip to plain text first
            content = elsevier_client.extract_text(content)
            if content is None:
                logger.info("%s: Couldn't get text from Elsevier XML" % pmid)
                num_elsevier_xml_fail += 1
                continue
            num_elsevier_xml += 1
            text_sources[full_pmid] = 'elsevier_xml'
            content_path = os.path.join(input_dir, '%s.txt' % pmid)
        elif content_type == 'txt':
            num_txt += 1
            text_sources[full_pmid] = 'txt'
            content_path = os.path.join(input_dir, '%s.txt' % pmid)
        elif content_type == 'abstract':
            num_abstract += 1
            text_sources[full_pmid] = 'abstract'
            content_path = os.path.join(input_dir, '%s.txt' % pmid)
def test_article():
    """extract_text returns None for an article with no extractable body."""
    # PMID: 11302724
    downloaded_xml = ec.download_article('10.1006/bbrc.2001.4693')
    assert ec.extract_text(downloaded_xml) is None
def test_article():
    """An article lacking a usable full-text element yields a None body."""
    # PMID: 11302724
    doi = '10.1006/bbrc.2001.4693'
    result = ec.extract_text(ec.download_article(doi))
    assert result is None
# NOTE(review): mid-function fragment — the initial `if`, the counter
# definitions (num_*), and the enclosing per-PMID loop are not visible
# here; indentation is reconstructed. The chain maps each S3 content
# type to a tally counter, a text_sources label, and an output path
# whose extension matches the content (.nxml for XML, .txt otherwise).
            logger.info('No content found on S3 for %s, skipping' % pmid)
            continue
        elif content_type == 'pmc_oa_xml':
            num_pmc_oa_xml += 1
            text_sources[full_pmid] = 'pmc_oa_xml'
            content_path = os.path.join(input_dir, '%s.nxml' % pmid)
        elif content_type == 'pmc_auth_xml':
            num_pmc_auth_xml += 1
            text_sources[full_pmid] = 'pmc_auth_xml'
            content_path = os.path.join(input_dir, '%s.nxml' % pmid)
        elif content_type == 'pmc_oa_txt':
            num_txt += 1
            text_sources[full_pmid] = 'pmc_oa_txt'
            content_path = os.path.join(input_dir, '%s.txt' % pmid)
        elif content_type == 'elsevier_xml':
            # Strip the Elsevier XML down to plain text before staging
            content = elsevier_client.extract_text(content)
            if content is None:
                logger.info("%s: Couldn't get text from Elsevier XML" % pmid)
                num_elsevier_xml_fail += 1
                continue
            num_elsevier_xml += 1
            text_sources[full_pmid] = 'elsevier_xml'
            content_path = os.path.join(input_dir, '%s.txt' % pmid)
        elif content_type == 'txt':
            num_txt += 1
            text_sources[full_pmid] = 'txt'
            content_path = os.path.join(input_dir, '%s.txt' % pmid)
        elif content_type == 'abstract':
            num_abstract += 1
            text_sources[full_pmid] = 'abstract'
            content_path = os.path.join(input_dir, '%s.txt' % pmid)