예제 #1
0
파일: machine.py 프로젝트: djmilstein/indra
def process_paper(model_name, pmid):
    """Read a paper with REACH, reusing a cached JSON output if present.

    Parameters
    ----------
    model_name : str
        Directory of the model. The JSON output for the paper is looked up
        at (and moved to) ``<model_name>/jsons/PMID<pmid>.json``.
    pmid : str
        The PMID of the paper to process.

    Returns
    -------
    rp : object or None
        Whatever the ``reach`` processing call returned, or None if the text
        could not be retrieved or its format is not handled here.
    txt_format : str or None
        'existing_json' when the cached JSON file was used, otherwise the
        format reported by ``get_full_text``; None if retrieval failed.
    """
    json_path = os.path.join(model_name, 'jsons', 'PMID%s.json' % pmid)

    # Only a warning is logged; processing continues with the ID as given
    if pmid.startswith(('api', 'PMID')):
        logger.warning('Invalid PMID: %s' % pmid)

    # If the paper has been read, use the json output file
    if os.path.exists(json_path):
        rp = reach.process_json_file(json_path, citation=pmid)
        txt_format = 'existing_json'
    # If the paper has not been read, download the text and read
    else:
        try:
            txt, txt_format = get_full_text(pmid, 'pmid')
        except Exception as e:
            # Catch Exception rather than everything so that
            # KeyboardInterrupt/SystemExit still propagate, and leave a
            # trace of the failure instead of dropping it silently.
            logger.warning('Could not get text for PMID %s: %s' % (pmid, e))
            return None, None

        if txt_format in ('pmc_oa_xml', 'elsevier_xml', 'abstract'):
            if txt_format == 'pmc_oa_xml':
                rp = reach.process_nxml_str(txt, citation=pmid, offline=True)
            else:
                if txt_format == 'elsevier_xml':
                    # Extract the raw text from the Elsevier XML
                    txt = elsevier_client.extract_text(txt)
                rp = reach.process_text(txt, citation=pmid, offline=True)
            # If a reach_output.json file is present in the working
            # directory after reading, keep it as this paper's cached output
            if os.path.exists('reach_output.json'):
                shutil.move('reach_output.json', json_path)
        else:
            rp = None

    if rp is not None:
        check_pmids(rp.statements)
    return rp, txt_format
예제 #2
0
def get_upload_content(pmid, force_fulltext_lookup=False):
    """Get full text and/or abstract for paper and upload to S3.

    Parameters
    ----------
    pmid : str
        The PMID of the paper; a leading 'PMID' prefix is stripped.
    force_fulltext_lookup : bool
        If True, the literature client is queried even when an abstract is
        already stored. Default: False

    Returns
    -------
    tuple
        A (content, content_type) pair. This is the stored content when no
        lookup is needed, the newly retrieved content otherwise, and
        (None, None) when the literature client reports no content type.
    """
    # A leading 'PMID' would screw up the literature clients, so drop it
    if pmid.startswith('PMID'):
        pmid = pmid[4:]

    # First, check what is already stored (S3, per the log messages below)
    stored_content, stored_type = get_full_text(pmid)

    # Only the abstract is stored and a full text lookup was not requested
    if stored_type == 'abstract' and not force_fulltext_lookup:
        return (stored_content, stored_type)

    # A lookup is needed if nothing is stored, if only the abstract is
    # stored and a lookup is forced, or if the stored Elsevier XML yields
    # no text when extracted
    lookup_needed = (
        stored_type is None
        or (stored_type == 'abstract' and force_fulltext_lookup)
        or (stored_type == 'elsevier_xml'
            and not elsevier_client.extract_text(stored_content))
    )
    # Some usable form of full text is already stored
    # TODO: could still look for the abstract here and upload it as well
    if not lookup_needed:
        return (stored_content, stored_type)

    # FIXME: special handling for stored Elsevier XML lacking a full text
    if stored_type == 'elsevier_xml':
        logger.info('elsevier_xml for %s missing full text element, '
                    'getting again.' % pmid)

    # Try to retrieve from literature client
    logger.info("PMID%s: getting content using literature client" % pmid)
    new_content, new_type = lit.get_full_text(pmid, 'pmid')
    assert new_type in ('pmc_oa_xml', 'elsevier_xml', 'abstract', None)

    # Not even an abstract came back: probably a problem with the web
    # service or the DOI
    if new_type is None:
        return (None, None)

    if new_type == 'abstract' and stored_type == 'abstract':
        # The abstract is already stored, nothing to upload
        logger.info("PMID%s: found abstract but already had it on "
                    "S3; skipping" % pmid)
    elif new_type == 'abstract' and stored_type is None:
        # Nothing was stored before, so upload the abstract
        logger.info("PMID%s: found abstract, uploading to S3" % pmid)
        put_abstract(pmid, new_content)
    else:
        # Everything else goes through put_full_text; this includes an
        # abstract retrieved while the stored type was 'elsevier_xml'
        logger.info("PMID%s: uploading %s" % (pmid, new_type))
        put_full_text(pmid, new_content, full_text_type=new_type)
    return (new_content, new_type)
예제 #3
0
def process_paper(model_name, pmid):
    """Process a paper with the given pubmed identifier

    Parameters
    ----------
    model_name : str
        The directory for the INDRA machine
    pmid : str
        The PMID to process.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements. None if the text could not be retrieved or its
        format is not handled.
    txt_format : str
        A string representing the format of the text; 'existing_json' when
        a previously written JSON file was used, None if retrieval failed.
    """
    json_path = os.path.join(model_name, 'jsons', 'PMID%s.json' % pmid)

    if pmid.startswith(('api', 'PMID')):
        logger.warning('Invalid PMID: %s' % pmid)

    # Keyword arguments shared by every fresh REACH reading call below
    read_kwargs = dict(citation=pmid, offline=True, output_fname=json_path)

    if os.path.exists(json_path):
        # The paper was read before: load the stored JSON output
        processor = reach.process_json_file(json_path, citation=pmid)
        text_type = 'existing_json'
    else:
        # Not read yet: fetch the text, then read it
        try:
            content, text_type = get_full_text(pmid, 'pmid')
        except Exception:
            return None, None

        if text_type == 'pmc_oa_xml':
            processor = reach.process_nxml_str(content, **read_kwargs)
        elif text_type == 'elsevier_xml':
            # Elsevier XML is reduced to raw text before reading
            raw_text = elsevier_client.extract_text(content)
            processor = reach.process_text(raw_text, **read_kwargs)
        elif text_type == 'abstract':
            processor = reach.process_text(content, **read_kwargs)
        else:
            processor = None

    if processor is not None:
        check_pmids(processor.statements)
    return processor, text_type
예제 #4
0
def process_paper(model_name, pmid):
    """Process a paper with the given pubmed identifier

    Parameters
    ----------
    model_name : str
        The directory for the INDRA machine
    pmid : str
        The PMID to process.

    Returns
    -------
    rp : ReachProcessor
        A ReachProcessor containing the extracted INDRA Statements
        in rp.statements, or None when nothing could be read.
    txt_format : str
        A string representing the format of the text
    """
    jsons_dir = os.path.join(model_name, 'jsons')
    output_path = os.path.join(jsons_dir, 'PMID%s.json' % pmid)

    looks_invalid = pmid.startswith('api') or pmid.startswith('PMID')
    if looks_invalid:
        logger.warning('Invalid PMID: %s' % pmid)

    if not os.path.exists(output_path):
        # No stored output for this paper: download the text and read it
        try:
            text, text_format = get_full_text(pmid, 'pmid')
        except Exception:
            return None, None

        if text_format == 'pmc_oa_xml':
            reader_result = reach.process_nxml_str(
                text, citation=pmid, offline=True, output_fname=output_path)
        elif text_format in ('elsevier_xml', 'abstract'):
            if text_format == 'elsevier_xml':
                # Extract the raw text from the Elsevier XML
                text = elsevier_client.extract_text(text)
            reader_result = reach.process_text(
                text, citation=pmid, offline=True, output_fname=output_path)
        else:
            reader_result = None
    else:
        # The paper has been read already: use the stored json output
        reader_result = reach.process_json_file(output_path, citation=pmid)
        text_format = 'existing_json'

    if reader_result is not None:
        check_pmids(reader_result.statements)
    return reader_result, text_format
예제 #5
0
파일: machine.py 프로젝트: djmilstein/indra
def process_paper_aws(pmid, start_time_local):
    """Load the stored REACH output for a paper and classify its content.

    Parameters
    ----------
    pmid : str
        Identifier of the paper whose reader output is fetched.
    start_time_local : datetime.datetime
        Time at which the calling script started; compared against the
        'LastModified' entry of the content metadata.

    Returns
    -------
    tuple
        (processor, content_type). Both are None if the content lookup
        raised; the processor is None if no reader output was found. The
        content type is replaced by 'existing_json' when the content was
        last modified before the script started.
    """
    try:
        metadata, content_type = get_full_text(pmid, metadata=True)
    except Exception as err:
        logger.error('Could not get content from S3: %s' % err)
        return None, None

    logger.info('Downloading %s output from AWS' % pmid)
    json_str = get_reader_json_str('reach', pmid)
    if not json_str:
        logger.info('Could not get output.')
        return None, content_type
    processor = reach.process_json_str(json_str)

    now_local = datetime.datetime.now(tzlocal.get_localzone())
    time_since_start = now_local - start_time_local
    time_since_modified = now_local - metadata['LastModified']
    # Content older than the script run was not modified by this run
    if time_since_modified > time_since_start:
        content_type = 'existing_json'
    return processor, content_type
예제 #6
0
def process_paper_aws(pmid, start_time_local):
    """Fetch REACH output for a paper from AWS and process it.

    Parameters
    ----------
    pmid : str
        Identifier of the paper to look up.
    start_time_local : datetime.datetime
        When the running script started.

    Returns
    -------
    tuple
        (None, None) if the content lookup fails, (None, content_type) if
        there is no reader output, otherwise (processor, content_type),
        with content_type set to 'existing_json' if the content's
        'LastModified' time precedes the start of the script.
    """
    try:
        meta, fetched_type = get_full_text(pmid, metadata=True)
    except Exception as e:
        logger.error('Could not get content from S3: %s' % e)
        return None, None

    logger.info('Downloading %s output from AWS' % pmid)
    reader_output = get_reader_json_str('reach', pmid)
    if not reader_output:
        logger.info('Could not get output.')
        return None, fetched_type

    rp = reach.process_json_str(reader_output)

    current = datetime.datetime.now(tzlocal.get_localzone())
    script_age = current - start_time_local
    content_age = current - meta['LastModified']
    # Unchanged since the script started means the output already existed
    final_type = 'existing_json' if content_age > script_age else fetched_type
    return rp, final_type
예제 #7
0
def test_get_full_text_pmc():
    """A PMC ID lookup should yield a large 'pmc_oa_xml' unicode document."""
    content, content_type = get_full_text('PMC4322985', 'pmcid')
    assert content_type == 'pmc_oa_xml'
    assert len(content) > 300000
    assert unicode_strs((content, content_type))
예제 #8
0
def test_get_full_text_pubmed_abstract():
    """A PMID lookup that falls back to the abstract returns 'abstract'."""
    # The CrossRef DOI lookup fails for this paper (page mismatch), so only
    # the abstract is expected
    content, content_type = get_full_text('27075779', 'pmid')
    assert content_type == 'abstract'
    assert len(content) > 800
    assert unicode_strs((content, content_type))
예제 #9
0
def test_get_full_text_doi():
    """A DOI lookup should yield a large 'pmc_oa_xml' unicode document."""
    content, content_type = get_full_text('10.18632/oncotarget.2555', 'doi')
    assert content_type == 'pmc_oa_xml'
    assert len(content) > 300000
    assert unicode_strs((content, content_type))
예제 #10
0
def test_gene_network():
    """Run the gene network walkthrough end to end, chunk by chunk.

    The chunk numbers in the comments are kept from the original code. Each
    chunk asserts that it produced a non-empty result. Relies on
    ``gn_stmts`` being defined outside this function (not visible here;
    presumably at module level).
    """
    # Chunk 1: this is tested in _get_gene_network_stmts
    # from indra.tools.gene_network import GeneNetwork
    # gn = GeneNetwork(['H2AX'])
    # biopax_stmts = gn.get_biopax_stmts()
    # bel_stmts = gn.get_bel_stmts()

    # Chunk 2
    from indra import literature
    pmids = literature.pubmed_client.get_ids_for_gene('H2AX')

    # Chunk 3: collect abstracts only, stopping once enough are gathered
    paper_contents = {}
    for pmid in pmids:
        content, content_type = literature.get_full_text(pmid, 'pmid')
        if content_type == 'abstract':
            paper_contents[pmid] = content
        if len(paper_contents) == 5:  # Is 10 in actual code
            break

    # Chunk 4
    from indra.sources import reach

    literature_stmts = []
    # Only the text is needed here, not the PMID it is keyed by
    for content in paper_contents.values():
        rp = reach.process_text(content, url=reach.local_text_url)
        literature_stmts += rp.statements
    print('Got %d statements' % len(literature_stmts))
    assert literature_stmts  # replaces a print statements

    # Chunk 6
    from indra.tools import assemble_corpus as ac
    # stmts = biopax_stmts + bel_stmts + literature_stmts  # tested elsewhere
    stmts = gn_stmts + literature_stmts  # Added instead of above line
    stmts = ac.map_grounding(stmts)
    stmts = ac.map_sequence(stmts)
    stmts = ac.run_preassembly(stmts)
    assert stmts

    # Chunk 7
    from indra.assemblers.cx import CxAssembler
    # ndex_client is only used by the disabled Chunk 8 below; the import is
    # kept so that it is still exercised
    from indra.databases import ndex_client
    cxa = CxAssembler(stmts)
    cx_str = cxa.make_model()
    assert cx_str

    # Chunk 8
    # ndex_cred = {'user': '******', 'password': '******'}
    # network_id = ndex_client.create_network(cx_str, ndex_cred)
    # print(network_id)

    # Chunk 9
    from indra.assemblers.indranet import IndraNetAssembler
    indranet_assembler = IndraNetAssembler(statements=stmts)
    indranet = indranet_assembler.make_model()
    assert len(indranet.nodes) > 0, 'indranet contains no nodes'
    assert len(indranet.edges) > 0, 'indranet contains no edges'

    # Chunk 10
    import networkx as nx
    paths = nx.single_source_shortest_path(G=indranet, source='H2AX', cutoff=1)
    assert paths

    # Chunk 11
    from indra.assemblers.pysb import PysbAssembler
    pysb = PysbAssembler(statements=stmts)
    pysb_model = pysb.make_model()
    assert pysb_model
예제 #11
0
#exploring pubmed search

from indra import literature
from indra.sources import reach

# Constants
# Upper bound on the number of PMIDs requested from the search
retmax = 100000
# Other candidate queries: 'elevated homocysteine', 'cardiac arrest',
# 'atrial fibrillation'. The intention is to eventually run this against
# an array of query strings.
query = 'Magnesium AND CKD'
# Search PubMed for the query
pmids = literature.pubmed_client.get_ids(query, retmax=retmax)

print('Got %d pmids' % len(pmids))
# print(pmids)
# Collect content keyed by PMID, keeping only papers for which the abstract
# is what was returned, and stop once 10 have been gathered
paper_contents = {}
for pmid in pmids:
    content, content_type = literature.get_full_text(pmid, 'pmid')
    if content_type == 'abstract':
        paper_contents[pmid] = content
    if len(paper_contents) == 10:
        break
# TODO: the analysis may need to move up into the loop above so that the
# PMID can be attached to each statement produced by REACH, letting us
# track the source document. However, the PMID is also known in the loop
# below, so it can be used there instead.
literature_stmts = []
for pmid, content in paper_contents.items():
  print(pmid)
  print(content)
  # Read the text with REACH, passing reach.local_text_url as the url
  rp = reach.process_text(content, url=reach.local_text_url)
  # TODO: add a loop here that attaches the pmid to every statement and then
  # appends that statement to literature_stmts.
  # NOTE: a first attempt did not work, because a statement is an object
  # and not JSON.
예제 #12
0
from indra import reach
from indra.literature import pmc_client, get_full_text, id_lookup
from assembly_eval import have_file, run_assembly

if __name__ == "__main__":
    folder = "reach"
    pmc_ids = [s.strip() for s in open("pmcids.txt", "rt").readlines()]
    pmids = [id_lookup(pmcid)["pmid"] for pmcid in pmc_ids]
    # Set to True only if reading should be ran again
    rerun = False

    # Download the papers if they are not available yet
    for pmcid in pmc_ids:
        prefix = folder + "/" + pmcid
        if not have_file(prefix + ".nxml") and not have_file(prefix + ".txt"):
            txt, txt_format = get_full_text(pmcid)
            if txt_format == "nxml":
                fname = prefix + ".nxml"
            else:
                fname = prefix + ".txt"
            with open(fname, "wt") as fh:
                fh.write(txt.encode("utf-8"))

    # Read each paper if it hasn't been read yet.
    # Otherwise use the existing json extractions.
    for pmcid, pmid in zip(pmc_ids, pmids):
        prefix = folder + "/" + pmcid
        print "Processing %s..." % pmcid
        # If REACH already processed it then don't run it again
        if rerun or not have_file(prefix + ".json"):
            if have_file(prefix + ".txt"):
예제 #13
0
def test_get_full_text_pmc():
    # Looking up a PMC ID is expected to give a long 'pmc_oa_xml' document
    # made up of unicode strings
    min_length = 300000
    body, fmt = get_full_text('PMC4322985', 'pmcid')
    assert fmt == 'pmc_oa_xml'
    assert len(body) > min_length
    assert unicode_strs((body, fmt))
예제 #14
0
def test_get_full_text_pubmed_abstract():
    # DOI lookup in CrossRef fails for this one because of page mismatch,
    # so the abstract is what should come back
    min_length = 800
    body, fmt = get_full_text('27075779', 'pmid')
    assert fmt == 'abstract'
    assert len(body) > min_length
    assert unicode_strs((body, fmt))
예제 #15
0
def test_get_full_text_doi():
    # Looking up a DOI is expected to give a long 'pmc_oa_xml' document
    # made up of unicode strings
    min_length = 300000
    body, fmt = get_full_text('10.18632/oncotarget.2555', 'doi')
    assert fmt == 'pmc_oa_xml'
    assert len(body) > min_length
    assert unicode_strs((body, fmt))