Пример #1
0
def id_lookup(paper_id, idtype=None):
    """This function takes a Pubmed ID, Pubmed Central ID, or DOI
    and use the Pubmed ID mapping
    service and looks up all other IDs from one
    of these. The IDs are returned in a dictionary."""
    if idtype is not None and idtype not in ('pmid', 'pmcid', 'doi'):
        raise ValueError("Invalid idtype %s; must be 'pmid', 'pmcid', "
                         "or 'doi'." % idtype)
    if paper_id.upper().startswith('PMC'):
        idtype = 'pmcid'
    # Strip off any prefix
    if paper_id.upper().startswith('PMID'):
        paper_id = paper_id[4:]
    elif paper_id.upper().startswith('DOI'):
        paper_id = paper_id[3:]
    data = {'ids': paper_id}
    if idtype is not None:
        data['idtype'] = idtype
    try:
        tree = pubmed_client.send_request(pmid_convert_url, data)
    except Exception as e:
        logger.error('Error looking up PMID in PMC: %s' % e)
        return {}
    if tree is None:
        return {}
    record = tree.find('record')
    if record is None:
        return {}
    doi = record.attrib.get('doi')
    pmid = record.attrib.get('pmid')
    pmcid = record.attrib.get('pmcid')
    ids = {'doi': doi, 'pmid': pmid, 'pmcid': pmcid}
    return ids
Пример #2
0
def id_lookup(paper_id, idtype=None):
    """This function takes a Pubmed ID, Pubmed Central ID, or DOI
    and use the Pubmed ID mapping
    service and looks up all other IDs from one
    of these. The IDs are returned in a dictionary."""
    if idtype is not None and idtype not in ('pmid', 'pmcid', 'doi'):
        raise ValueError("Invalid idtype %s; must be 'pmid', 'pmcid', "
                         "or 'doi'." % idtype)
    if paper_id.upper().startswith('PMC'):
        idtype = 'pmcid'
    # Strip off any prefix
    if paper_id.upper().startswith('PMID'):
        paper_id = paper_id[4:]
    elif paper_id.upper().startswith('DOI'):
        paper_id = paper_id[3:]
    data = {'ids': paper_id}
    if idtype is not None:
        data['idtype'] = idtype
    tree = pubmed_client.send_request(pmid_convert_url, data)
    if tree is None:
        return {}
    record = tree.find('record')
    if record is None:
        return {}
    doi = record.attrib.get('doi')
    pmid = record.attrib.get('pmid')
    pmcid = record.attrib.get('pmcid')
    ids = {'doi': doi,
           'pmid': pmid,
           'pmcid': pmcid}
    return ids
Пример #3
0
def test_get_annotations():
    pmid = '30971'
    tree = pubmed_client.send_request(
        pubmed_client.pubmed_fetch, dict(db='pubmed', retmode='xml', id=pmid))
    results = pubmed_client.get_metadata_from_xml_tree(tree,
                                                       mesh_annotations=True)
    assert len(results) == 1, len(results)
    assert 'mesh_annotations' in results[pmid].keys(), results[pmid].keys()
    me_ans = results[pmid]['mesh_annotations']
    assert len(me_ans) == 9, len(me_ans)
    assert all(d['mesh'].startswith('D') for d in me_ans)
    assert any(d['major_topic'] for d in me_ans)
Пример #4
0
def test_get_annotations():
    pmid = '30971'
    tree = pubmed_client.send_request(pubmed_client.pubmed_fetch,
                                      dict(db='pubmed', retmode='xml',
                                           id=pmid))
    results = pubmed_client.get_metadata_from_xml_tree(tree,
                                                       mesh_annotations=True)
    assert len(results) == 1, len(results)
    assert 'mesh_annotations' in results[pmid].keys(), results[pmid].keys()
    me_ans = results[pmid]['mesh_annotations']
    assert len(me_ans) == 9, len(me_ans)
    assert all(d['mesh'].startswith('D') for d in me_ans)
    assert any(d['major_topic'] for d in me_ans)
Пример #5
0
def test_send_request_invalid():
    time.sleep(0.5)
    res = pubmed_client.send_request('http://xxxxxxx', data={})
    assert res is None
Пример #6
0
def build_set(n, parent_dir):
    """Create the nastiest set of content we're willing/able to handle.

    We create a small local representation of the entirety of the NLM
    repositories we use, including all the nasty corner cases we can manage.
    This allows for rapid development and testing.

    Parameters
    ----------
    n : int
        The number of instances (distinct articles) of each test case to be
        included. Examples are chosen as randomly as possible. Multiple samples
        generally increase the reliability of the test.
    parent_dir : str
        The head of the tree that stands in place of the url to the nih ftp
        directory.
    """

    # Create the necessary directories.
    def get_path(sub_path):
        return os.path.join(parent_dir, sub_path)

    if os.path.exists(parent_dir):
        shutil.rmtree(parent_dir)
    os.makedirs(parent_dir)
    os.makedirs(get_path('pub/pmc'))
    os.makedirs(get_path('pubmed/baseline'))
    os.makedirs(get_path('pub/pmc/manuscript'))

    # Get the pmid data from medline (med_pmid_list)
    print("Getting medline lists...")
    med_pmid_list = []
    med = Pubmed()
    for i in range(1, 7):
        buf = BytesIO()
        med.ftp.ret_file("MuId-PmId-%d.zip" % i, buf)
        zf = zipfile.ZipFile(buf)
        with zf.open(zf.namelist()[0]) as id_f:
            id_str = id_f.read().decode('utf8')
        med_pmid_list += [l.split('\t')[1] for l in id_str.splitlines()]

    statementful_pmids = [
        '20949557', '23898069', '19801969', '21042724', '14675752', '25897078',
        '25486481', '12890751', '11251186', '20622853', '25616414', '21878640',
        '23295773', '19747910', '25778309', '25939761', '11871856', '16580132',
        '24730770', '23921085', '22018470', '19405127', '21464949', '18321309',
        '7907095', '12048232', '23751074', '18711136', '13679391', '22193543',
        '26645886', '27086966', '14570914', '20538416', '9417079', '23200589',
        '15146469', '18084123', '19265534', '19449221', '27381626', '14976202',
        '22445724', '20040392', '26039245', '17881156', '15902258', '1745350',
        '18276758', '22764095', '20652941', '25834816', '23068100', '16407218',
        '18830263', '24265318', '19752028', '8589722', '22671588', '14745431',
        '25042645', '19403642', '14707024', '23536437', '21167476', '22801439',
        '25726184', '19723643', '17409824', '28679432', '26908611', '20164468',
        '15189946', '12086229', '21900397', '12324477', '15545228', '23376846',
        '21719749', '20608972', '23583295', '23236067', '9705962', '20068183',
        '19437340', '14534726', '25731731', '15337767', '28067895', '25092803',
        '19261749', '22272295', '27121230', '23302038', '17410335', '17399955',
        '16254247', '21685363', '26598524', '25645929', '1386335', '20606534',
        '22492281', '22158902', '22022427', '24775712', '21298412', '24753544',
        '12553064', '19681600', '17912454', '17597401', '20672986', '21362231',
        '17999917', '21470928', '27334922', '16159962', '21079653', '15125833',
        '27617579', '19048115', '18687691', '27797218', '26413934', '16684954',
        '20501406', '27515963', '22784503', '25941399', '12473120', '17891137',
        '16733295', '23826126', '21427728', '8900182', '26234677', '24648515',
        '25786138', '12958678', '16998791', '19061835', '11283269', '18258923',
        '11839584', '20132317', '19158374', '23245941', '23352210', '15465819',
        '15386433', '22575647', '15966238', '23633483', '25131797', '17102080',
        '19956840', '18506362', '17961162', '1607067', '24770328', '19825990',
        '22365656', '19720761', '24435975', '26882953', '17292826', '25119113',
        '26044620', '20717925', '15316008', '16619041', '19893488', '26999786',
        '26103054', '17331464', '20022966', '24189165', '19059939', '25474223',
        '20507346', '20976540', '2810532', '15685397', '27562587', '18538673',
        '15712349', '15448517', '27467210', '7584044', '21330319', '18381962',
        '24789704', '19058873', '10523313'
    ]

    elsevier_pmids = [
        "140233", "126700", "138421", "131864", "122916", "127363", "130834",
        "135691", "147139", "142190", "124378", "132969", "127549", "131583",
        "148910", "140686", "126304", "124909", "145863", "127687", "143909",
        "134286", "144524", "145955", "125088", "122895", "144611", "152202",
        "140767", "139895", "152644", "140057", "149561", "143963", "136992",
        "137557", "144535", "148891", "145321", "133684", "126386", "148890",
        "124210", "131711", "124967", "138753", "132192", "142510", "130244",
        "123485", "126883", "151536", "126948", "137419", "141952", "130051",
        "122816", "150450", "133686", "126866", "138748", "149542", "144038",
        "145957", "136213", "148513", "141931", "140056", "139935", "123177",
        "124593", "141942", "133729", "124598", "124252", "126303", "152671",
        "141908", "124625", "152721", "150335", "133685", "150977", "124154",
        "140713", "146095", "123742", "140478", "143938", "140806", "124600",
        "123729", "127548", "145041", "139938", "143289", "131554", "125206",
        "142661", "122933"
    ]

    # Get the data from pmc oa (pmc_dicts)
    print("Getting pmc oa lists....")
    pmc = PmcOA()
    pmc_dicts = pmc.ftp.get_csv_as_dict('oa_file_list.csv', header=0)

    # Get the data for the manuscripts (man_dicts)
    print("Getting manuscript lists...")
    man = Manuscripts()
    man_dicts = man.ftp.get_csv_as_dict('filelist.csv', header=0)

    # Get pmid, pmcid, mid tuples for the examples that we will use.
    print("Generating example sets...")
    examples = []
    for case in [(1, 0, 0), (1, 1, 0), (0, 1, 0), (1, 1, 1), (1, 0, 1)]:
        for _ in range(n):
            example = _get_example(case, med_pmid_list, pmc_dicts, man_dicts)
            examples.append(example)

    # Add a few pmids that probably include some statements.
    for pmid in random.sample(statementful_pmids, n):
        examples.append((pmid, '', ''))

    # Add a few pmids that link to elsevier content
    for pmid in random.sample(elsevier_pmids, n):
        examples.append((pmid, '', ''))

    # Add a special article to check article info.
    double_doi_info = med.get_article_info('baseline/pubmed18n0343.xml.gz')
    pmids_w_double_doi = [
        k for k, v in double_doi_info.items()
        if v['doi'] is not None and len(v['doi']) > 100
    ]
    assert len(pmids_w_double_doi), "No double dois found."
    examples.append((
        random.choice(pmids_w_double_doi),
        '',
        '',
    ))

    # Create the test medline file.
    print("Creating medline test file...")
    pmid_list = [pmid for pmid, _, _ in examples if pmid != '']
    tree = None
    for pmid in pmid_list:
        params = {'db': 'pubmed', 'retmode': 'xml', 'id': pmid}
        if tree is None:
            tree = pub.send_request(pub.pubmed_fetch, params)
        else:
            child = pub.send_request(pub.pubmed_fetch, params).getchildren()[0]
            tree.append(child)
    if tree is not None:
        f_bts = b''
        f_bts += b"<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
        f_bts += ET.tostring(tree)
        f_path = get_path('pubmed/baseline/pubmed18nTEST.xml.gz')
        with open(f_path, 'wb') as gzf:
            gzf.write(gzip.compress(f_bts))

    # Create the test pmc oa article directory.
    print("Getting pmc oa xmls...")
    art_dirname = get_path('pub/pmc/articles.TEST.xml')
    if os.path.exists(art_dirname):
        shutil.rmtree(art_dirname)
    os.mkdir(art_dirname)
    pmcid_list = [pmcid for _, pmcid, _ in examples if pmcid != '']
    ex_pmc_dicts = [d for d in pmc_dicts if d['Accession ID'] in pmcid_list]
    for d in ex_pmc_dicts:
        fname = pmc.ftp.download_file(d['File'])
        with tarfile.open(fname, 'r:gz') as tar:
            mems = tar.getmembers()
            mem = [mem for mem in mems if mem.name.endswith('.nxml')][0]
            f_str = tar.extractfile(mem).read()
        fname = d['Accession ID'] + '.nxml'
        re_ret = re.findall('<journal-title>(.*?)</journal-title>',
                            f_str.decode('utf8'))
        if len(re_ret):
            sub_dir = os.path.join(
                art_dirname, re_ret[0].replace(' ', '_').replace('&', ''))
        else:
            sub_dir = os.path.join(art_dirname, 'unknown')
        if not os.path.exists(sub_dir):
            os.mkdir(sub_dir)
        path = os.path.join(sub_dir, fname)
        with open(path, 'wb') as f:
            f.write(f_str)
    with tarfile.open(art_dirname + '.tar.gz', 'w:gz') as tar:
        for dirname in os.listdir(art_dirname):
            tar.add(os.path.join(art_dirname, dirname), arcname=dirname)
    shutil.rmtree(art_dirname)

    # Create deleted pmids file (just make an empty file,for now.
    # TODO: Add test case to touch this.
    with open(get_path('pubmed/deleted.pmids.gz'), 'wb') as gzf:
        gzf.write(gzip.compress(b''))

    # Create the test manuscripts file.
    print('Adding manuscript directories...')
    dirfmt = get_path('pub/pmc/manuscript/%s')
    dirnames = [dirfmt % ('PMC00%dXXXXXX.xml' % i) for i in range(2, 6)]
    for dirname in dirnames:
        if os.path.exists(dirname):
            shutil.rmtree(dirname)
        os.mkdir(dirname)
    ex_man_dicts = [d for d in man_dicts if d['PMCID'] in pmcid_list]
    for d in ex_man_dicts:
        d['Tarfile'] = man.get_tarname_from_filename(d['File'])
    tar_members = dict.fromkeys(set([d['Tarfile'] for d in ex_man_dicts]))
    for tarname in tar_members.keys():
        if not os.path.exists(tarname):
            print("\tDownloading %s..." % tarname)
            man.ftp.download_file(tarname)
    for d in ex_man_dicts:
        parent_dir = os.path.join(dirfmt % tarname.replace('.tar.gz', ''),
                                  os.path.dirname(d['File']))
        test_fname = os.path.join(dirfmt % tarname.replace('.tar.gz', ''),
                                  d['File'])
        with tarfile.open(d['Tarfile'], 'r:gz') as tar:
            print('\tExtracting %s from %s...' % (d['File'], d['Tarfile']))
            tar.extract(d['File'])
        if not os.path.exists(parent_dir):
            os.makedirs(parent_dir)
        os.rename(d['File'], test_fname)
    for dirname in dirnames:
        with tarfile.open(dirname + '.tar.gz', 'w:gz') as tar:
            for sub_dirname in os.listdir(dirname):
                tar.add(os.path.join(dirname, sub_dirname),
                        arcname=sub_dirname)
        shutil.rmtree(dirname)

    return examples
Пример #7
0
def test_send_request_invalid():
    time.sleep(0.3)
    res = pubmed_client.send_request('http://xxxxxxx', data={})
    assert res is None