def get_db_with_ftp_content():
    """Populate a database with content from all of the ftp services.

    Starts from a database pre-loaded with pubmed content, then layers the
    PMC-OA and Manuscripts content on top of it using the local test ftp
    mirror.
    """
    from indra_db.managers.content_manager import PmcOA, Manuscripts
    db = get_db_with_pubmed_content()
    # Apply each remaining content service in turn against the test mirror.
    for manager_class in (PmcOA, Manuscripts):
        manager = manager_class(ftp_url=get_test_ftp_url(), local=True)
        manager.populate(db)
    return db
def test_full_upload():
    "Test whether we can perform a targeted upload to a test db."
    # This uses a specially curated sample directory designed to access most
    # code paths that the real system might experience, but on a much smaller
    # (thus faster) scale. Errors in the ftp service will not be caught by
    # this test.

    # Test the medline/pubmed upload.
    db, pm = get_test_db_with_pubmed_content(with_pm=True)
    tr_list = db.select_all('text_ref')
    assert len(tr_list), "No text refs were added..."
    assert all([hasattr(tr, 'pmid') for tr in tr_list]),\
        'All text_refs MUST have pmids by now.'
    # Every MESH annotation held by the Pubmed manager should have landed in
    # the MeshRefAnnotations table.
    mra_list = db.select_all(db.MeshRefAnnotations)
    num_mra_exp = sum(len(ann) for ann in pm.annotations.values())
    assert len(mra_list) == num_mra_exp,\
        "Only %s/%s annotations added" % (len(mra_list), num_mra_exp)
    assert all([hasattr(mra, 'mesh_num') for mra in mra_list]), \
        'All MESH annotations should have a mesh ID Number.'

    # Test the pmc oa upload.
    PmcOA(ftp_url=get_test_ftp_url(), local=True).populate(db)
    tcs_pmc = db.filter_query(
        db.TextContent,
        db.TextContent.source == PmcOA.my_source).count()
    assert tcs_pmc, "No pmc oa fulltext was added."
    # Each piece of pmc content must correspond to a text ref with a pmcid,
    # though refs with pmcids need not all have content.
    trs_w_pmcids = db.filter_query(db.TextRef,
                                   db.TextRef.pmcid.isnot(None)).count()
    assert trs_w_pmcids >= tcs_pmc,\
        "Only %d of at least %d pmcids added." % (trs_w_pmcids, tcs_pmc)

    # Test the manuscripts upload.
    Manuscripts(ftp_url=get_test_ftp_url(), local=True).populate(db)
    tcs_manu = db.filter_query(
        db.TextContent,
        db.TextContent.source == Manuscripts.my_source).count()
    assert tcs_manu, "No manuscripts uploaded."
    trs_w_mids = db.filter_query(db.TextRef,
                                 db.TextRef.manuscript_id.isnot(None)).count()
    assert trs_w_mids >= tcs_manu,\
        "Only %d of at least %d manuscript ids added." % (trs_w_mids, tcs_manu)

    # Some overall checks: the combined upload should have produced exactly
    # these (source, format, text_type) combinations, no more, no less.
    tc_list = db.select_all(db.TextContent)
    set_exp = {('manuscripts', 'xml', 'fulltext'),
               ('pmc_oa', 'xml', 'fulltext'),
               ('pubmed', 'text', 'abstract'),
               ('pubmed', 'text', 'title')}
    set_got = set([(tc.source, tc.format, tc.text_type) for tc in tc_list])
    assert set_exp == set_got,\
        "Expected %s, got %s for content layout." % (set_exp, set_got)

    # Test careful upload of medline (very shallow test...checks only for
    # critical failures)
    m = Pubmed(ftp_url=get_test_ftp_url(), local=True)
    m.load_files(db, 'baseline', carefully=True)
def test_multible_pmc_oa_content():
    """Check that re-populating pmc oa does not duplicate text content."""
    db = get_temp_db()
    pmc_manager = PmcOA(ftp_url=get_test_ftp_url(), local=True)

    # First pass establishes the baseline content count.
    pmc_manager.populate(db)
    baseline_count = len(db.select_all('text_content'))

    # A second pass over the same source must not add any new rows.
    pmc_manager.populate(db)
    final_count = len(db.select_all('text_content'))
    assert final_count == baseline_count,\
        "Duplicate text content allowed to be submitted."
    return
def test_multiple_text_ref_pmc_oa():
    """Check that submitting the same pmc oa text ref twice adds it once."""
    db = get_temp_db()
    pmc_manager = PmcOA(ftp_url=get_test_ftp_url(), local=True)
    pmc_manager.review_fname = 'test_review_multiple_text_ref_pmc_oa.txt'

    # Build a single text-ref record with only pmcid and doi filled in.
    ref_record = dict.fromkeys(pmc_manager.tr_cols)
    ref_record.update(pmcid='PMC5579538', doi='10.1021/acsomega.7b00205')

    # Upload once to establish the baseline, then again with identical input.
    pmc_manager.upload_batch(db, [ref_record], [])
    baseline_refs = len(db.select_all('text_ref'))
    pmc_manager.upload_batch(db, [ref_record], [])
    assert len(db.select_all('text_ref')) == baseline_refs,\
        "Duplicate refs allowed to be submitted.."

    remove(pmc_manager.review_fname)
    return
def build_set(n, parent_dir):
    """Create the nastiest set of content we're willing/able to handle.

    We create a small local representation of the entirety of the NLM
    repositories we use, including all the nasty corner cases we can
    manage. This allows for rapid development and testing.

    Parameters
    ----------
    n : int
        The number of instances (distinct articles) of each test case to be
        included. Examples are chosen as randomly as possible. Multiple
        samples generally increase the reliability of the test.
    parent_dir : str
        The head of the tree that stands in place of the url to the nih
        ftp directory.
    """
    # Create the necessary directories.
    def get_path(sub_path):
        # Resolve a relative path under the local stand-in for the ftp tree.
        return os.path.join(parent_dir, sub_path)

    if os.path.exists(parent_dir):
        shutil.rmtree(parent_dir)
    os.makedirs(parent_dir)
    os.makedirs(get_path('pub/pmc'))
    os.makedirs(get_path('pubmed/baseline'))
    os.makedirs(get_path('pub/pmc/manuscript'))

    # Get the pmid data from medline (med_pmid_list)
    print("Getting medline lists...")
    med = Pubmed()
    # This resource appears to have disappeared, sadly.
    # med_pmid_list = []
    # for i in range(1, 7):
    #     buf = BytesIO()
    #     med.ftp.ret_file("MuId-PmId-%d.zip" % i, buf)
    #     zf = zipfile.ZipFile(buf)
    #     with zf.open(zf.namelist()[0]) as id_f:
    #         id_str = id_f.read().decode('utf8')
    #     med_pmid_list += [l.split('\t')[1] for l in id_str.splitlines()]

    # Hard-coded pmids known (at curation time) to yield INDRA statements.
    statementful_pmids = [
        '20949557', '23898069', '19801969', '21042724', '14675752',
        '25897078', '25486481', '12890751', '11251186', '20622853',
        '25616414', '21878640', '23295773', '19747910', '25778309',
        '25939761', '11871856', '16580132', '24730770', '23921085',
        '22018470', '19405127', '21464949', '18321309', '7907095',
        '12048232', '23751074', '18711136', '13679391', '22193543',
        '26645886', '27086966', '14570914', '20538416', '9417079',
        '23200589', '15146469', '18084123', '19265534', '19449221',
        '27381626', '14976202', '22445724', '20040392', '26039245',
        '17881156', '15902258', '1745350', '18276758', '22764095',
        '20652941', '25834816', '23068100', '16407218', '18830263',
        '24265318', '19752028', '8589722', '22671588', '14745431',
        '25042645', '19403642', '14707024', '23536437', '21167476',
        '22801439', '25726184', '19723643', '17409824', '28679432',
        '26908611', '20164468', '15189946', '12086229', '21900397',
        '12324477', '15545228', '23376846', '21719749', '20608972',
        '23583295', '23236067', '9705962', '20068183', '19437340',
        '14534726', '25731731', '15337767', '28067895', '25092803',
        '19261749', '22272295', '27121230', '23302038', '17410335',
        '17399955', '16254247', '21685363', '26598524', '25645929',
        '1386335', '20606534', '22492281', '22158902', '22022427',
        '24775712', '21298412', '24753544', '12553064', '19681600',
        '17912454', '17597401', '20672986', '21362231', '17999917',
        '21470928', '27334922', '16159962', '21079653', '15125833',
        '27617579', '19048115', '18687691', '27797218', '26413934',
        '16684954', '20501406', '27515963', '22784503', '25941399',
        '12473120', '17891137', '16733295', '23826126', '21427728',
        '8900182', '26234677', '24648515', '25786138', '12958678',
        '16998791', '19061835', '11283269', '18258923', '11839584',
        '20132317', '19158374', '23245941', '23352210', '15465819',
        '15386433', '22575647', '15966238', '23633483', '25131797',
        '17102080', '19956840', '18506362', '17961162', '1607067',
        '24770328', '19825990', '22365656', '19720761', '24435975',
        '26882953', '17292826', '25119113', '26044620', '20717925',
        '15316008', '16619041', '19893488', '26999786', '26103054',
        '17331464', '20022966', '24189165', '19059939', '25474223',
        '20507346', '20976540', '2810532', '15685397', '27562587',
        '18538673', '15712349', '15448517', '27467210', '7584044',
        '21330319', '18381962', '24789704', '19058873', '10523313'
    ]
    # Hard-coded pmids that link to elsevier content.
    elsevier_pmids = [
        "140233", "126700", "138421", "131864", "122916", "127363",
        "130834", "135691", "147139", "142190", "124378", "132969",
        "127549", "131583", "148910", "140686", "126304", "124909",
        "145863", "127687", "143909", "134286", "144524", "145955",
        "125088", "122895", "144611", "152202", "140767", "139895",
        "152644", "140057", "149561", "143963", "136992", "137557",
        "144535", "148891", "145321", "133684", "126386", "148890",
        "124210", "131711", "124967", "138753", "132192", "142510",
        "130244", "123485", "126883", "151536", "126948", "137419",
        "141952", "130051", "122816", "150450", "133686", "126866",
        "138748", "149542", "144038", "145957", "136213", "148513",
        "141931", "140056", "139935", "123177", "124593", "141942",
        "133729", "124598", "124252", "126303", "152671", "141908",
        "124625", "152721", "150335", "133685", "150977", "124154",
        "140713", "146095", "123742", "140478", "143938", "140806",
        "124600", "123729", "127548", "145041", "139938", "143289",
        "131554", "125206", "142661", "122933"
    ]

    # Get the data from pmc oa (pmc_dicts)
    print("Getting pmc oa lists....")
    pmc = PmcOA()
    pmc_dicts = pmc.ftp.get_csv_as_dict('oa_file_list.csv', header=0)

    # Get the data for the manuscripts (man_dicts)
    print("Getting manuscript lists...")
    man = Manuscripts()
    man_dicts = man.ftp.get_csv_as_dict('filelist.csv', header=0)

    # Get pmid, pmcid, mid tuples for the examples that we will use.
    # Each case tuple flags which of (pmid, pmcid, mid) must be present.
    print("Generating example sets...")
    examples = []
    for case in [(1, 0, 0), (1, 1, 0), (0, 1, 0), (1, 1, 1), (1, 0, 1)]:
        for _ in range(n):
            example = _get_example(case, statementful_pmids + elsevier_pmids,
                                   pmc_dicts, man_dicts)
            examples.append(example)

    # Add a few pmids that probably include some statements.
    for pmid in random.sample(statementful_pmids, n):
        examples.append((pmid, '', ''))

    # Add a few pmids that link to elsevier content
    for pmid in random.sample(elsevier_pmids, n):
        examples.append((pmid, '', ''))

    # Add a special article to check article info.
    year_nums = str(datetime.now().year)[-2:]
    double_doi_info = med.get_article_info('baseline/pubmed%sn0343.xml.gz'
                                           % year_nums)
    pmids_w_double_doi = [
        k for k, v in double_doi_info.items()
        if v['doi'] is not None and len(v['doi']) > 100
    ]
    assert len(pmids_w_double_doi), "No double dois found."
    examples.append((
        random.choice(pmids_w_double_doi),
        '',
        '',
    ))

    # Create the test medline file by fetching each pmid from the pubmed
    # eutils API and merging the responses into one XML tree.
    print("Creating medline test file...")
    pmid_list = [pmid for pmid, _, _ in examples if pmid != '']
    tree = None
    for pmid in pmid_list:
        params = {'db': 'pubmed', 'retmode': 'xml', 'id': pmid}
        if tree is None:
            tree = pub.send_request(pub.pubmed_fetch, params)
        else:
            resp = pub.send_request(pub.pubmed_fetch, params)
            attempts = 1
            # NOTE(review): `not resp` relies on Element truthiness (falsy
            # when the element has no children), and `getchildren()` was
            # removed in Python 3.9 — use list(resp) instead; confirm the
            # supported Python version.
            while not resp and attempts <= 3:
                resp = pub.send_request(pub.pubmed_fetch, params)
                attempts += 1
                sleep(1)
            child = resp.getchildren()[0]
            tree.append(child)
        # Be polite to the eutils rate limit.
        sleep(0.5)
    if tree is not None:
        f_bts = b''
        f_bts += b"<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
        f_bts += ET.tostring(tree)
        f_path = get_path('pubmed/baseline/pubmed18nTEST.xml.gz')
        with open(f_path, 'wb') as gzf:
            gzf.write(gzip.compress(f_bts))

    # Create the test pmc oa article directory.
    print("Getting pmc oa xmls...")
    art_dirname = get_path('pub/pmc/articles.TEST.xml')
    if os.path.exists(art_dirname):
        shutil.rmtree(art_dirname)
    os.mkdir(art_dirname)
    pmcid_list = [pmcid for _, pmcid, _ in examples if pmcid != '']
    ex_pmc_dicts = [d for d in pmc_dicts if d['Accession ID'] in pmcid_list]
    for d in ex_pmc_dicts:
        fname = pmc.ftp.download_file(d['File'])
        with tarfile.open(fname, 'r:gz') as tar:
            mems = tar.getmembers()
            # Take the first nxml member as the article fulltext.
            mem = [mem for mem in mems if mem.name.endswith('.nxml')][0]
            f_str = tar.extractfile(mem).read()
        fname = d['Accession ID'] + '.nxml'
        # Sort articles into per-journal subdirectories, mirroring the real
        # pmc oa layout; journal title is scraped from the xml itself.
        re_ret = re.findall('<journal-title>(.*?)</journal-title>',
                            f_str.decode('utf8'))
        if len(re_ret):
            sub_dir = os.path.join(
                art_dirname,
                re_ret[0].replace(' ', '_').replace('&', ''))
        else:
            sub_dir = os.path.join(art_dirname, 'unknown')
        if not os.path.exists(sub_dir):
            os.mkdir(sub_dir)
        path = os.path.join(sub_dir, fname)
        with open(path, 'wb') as f:
            f.write(f_str)
    with tarfile.open(art_dirname + '.tar.gz', 'w:gz') as tar:
        for dirname in os.listdir(art_dirname):
            tar.add(os.path.join(art_dirname, dirname), arcname=dirname)
    shutil.rmtree(art_dirname)

    # Create deleted pmids file (just make an empty file, for now).
    # TODO: Add test case to touch this.
    with open(get_path('pubmed/deleted.pmids.gz'), 'wb') as gzf:
        gzf.write(gzip.compress(b''))

    # Create the test manuscripts file.
    print('Adding manuscript directories...')
    dirfmt = get_path('pub/pmc/manuscript/%s')
    dirnames = [dirfmt % ('PMC00%dXXXXXX.xml' % i) for i in range(2, 6)]
    for dirname in dirnames:
        if os.path.exists(dirname):
            shutil.rmtree(dirname)
        os.mkdir(dirname)
    ex_man_dicts = [d for d in man_dicts if d['PMCID'] in pmcid_list]
    for d in ex_man_dicts:
        d['Tarfile'] = man.get_tarname_from_filename(d['File'])
    tar_members = dict.fromkeys(set([d['Tarfile'] for d in ex_man_dicts]))
    for tarname in tar_members.keys():
        if not os.path.exists(tarname):
            print("\tDownloading %s..." % tarname)
            man.ftp.download_file(tarname)
    for d in ex_man_dicts:
        # NOTE(review): `tarname` here is left over from the previous loop,
        # so all files are placed under the last-seen tarfile's directory.
        # This looks like it should be d['Tarfile'] — confirm before relying
        # on multi-tarfile layouts.
        parent_dir = os.path.join(dirfmt % tarname.replace('.tar.gz', ''),
                                  os.path.dirname(d['File']))
        test_fname = os.path.join(dirfmt % tarname.replace('.tar.gz', ''),
                                  d['File'])
        try:
            with tarfile.open(d['Tarfile'], 'r:gz') as tar:
                print('\tExtracting %s from %s...'
                      % (d['File'], d['Tarfile']))
                tar.extract(d['File'])
        except KeyError:
            print("WARNING: Could not extract %s from %s"
                  % (d['File'], d['Tarfile']))
            continue
        if not os.path.exists(parent_dir):
            os.makedirs(parent_dir)
        os.rename(d['File'], test_fname)
    for dirname in dirnames:
        with tarfile.open(dirname + '.tar.gz', 'w:gz') as tar:
            for sub_dirname in os.listdir(dirname):
                tar.add(os.path.join(dirname, sub_dirname),
                        arcname=sub_dirname)
        shutil.rmtree(dirname)

    return examples
def get_test_db_with_ftp_content():
    """Populate a test database with content from all the ftp services.

    Builds on the pubmed-populated test database, then runs the PMC-OA and
    Manuscripts managers against the local test ftp mirror.
    """
    db = get_test_db_with_pubmed_content()
    pmc_manager = PmcOA(ftp_url=get_test_ftp_url(), local=True)
    pmc_manager.populate(db)
    manuscript_manager = Manuscripts(ftp_url=get_test_ftp_url(), local=True)
    manuscript_manager.populate(db)
    return db
def test_id_handling_pmc_oa():
    """Test every conceivable combination pmid/pmcid presence.

    Seeds the database with text refs covering all pmid/pmcid states that
    could come out of medline, then uploads a pmc oa batch that exercises
    matching, missing-id lookup, conflicting ids, and multiple matches,
    and checks the resulting text refs, review-file messages, and content.
    """
    db = get_temp_db(clear=True)
    pmc = PmcOA(ftp_url=get_test_ftp_url(), local=True)

    # Initialize with all possible states we could have gotten from medline.
    pm_inp_tpl_list = capitalize_list_of_tpls(
        [('caseA%d' % i, 'PMCcaseA%d' % i) for i in range(2)] +
        [('caseB%d' % i, None) for i in range(2)] +
        [(None, 'PMCcaseC%d' % i) for i in range(2)] +
        [('caseMisMatchA', 'PMCcaseMisMatchB'),
         ('caseMisMatchB', 'PMCcaseMisiMatchB'),
         ('caseMultiMatch', 'PMCcaseMultiMatch'),
         ('28884161', None),
         ('26977217', 'PMC4771487')])
    db.insert_many('text_ref', [dict(zip(('pmid', 'pmcid'), d))
                                for d in pm_inp_tpl_list])

    # Prepare the 'batch' to be submitted for pmc oa, and try it.
    oa_inp_tpl_list = capitalize_list_of_tpls(
        [('case%s0' % l, 'PMCcase%s0' % l) for l in ['A', 'B', 'C']] +
        [(None, 'PMCcase%s1' % l) for l in ['A', 'B', 'C']] +
        [
            (None, 'PMC5579538'),  # lookup pmid in db
            (None, 'PMC4238023'),  # lookup no pmid in db
            ('26977217', 'PMC5142709'),  # conflicting pmcid
            ('caseMisMatchB', 'PMCcaseMisMatchA'),  # multiple matches
            ('caseMultiMatch', 'PMCnotmatching'),
            ('notmatching', 'PMCcaseMultiMatch'),
        ])
    tr_inp = []
    for pmid, pmcid in oa_inp_tpl_list:
        inp_dict = dict.fromkeys(pmc.tr_cols)
        inp_dict.update(pmcid=pmcid, pmid=pmid)
        tr_inp.append(inp_dict)
    tc_inp = [{'pmcid': pmcid, 'text_type': 'txt', 'content': b'content'}
              for _, pmcid in oa_inp_tpl_list]
    pmc.review_fname = 'test_review_%s.txt' % pmc.my_source
    pmc.upload_batch(db, tr_inp, tc_inp)

    # Check the text refs.
    expected_pairs = capitalize_list_of_tpls([
        ('caseA0', 'PMCcaseA0'),
        ('caseA1', 'PMCcaseA1'),
        ('caseB0', 'PMCcaseB0'),
        ('caseB1', None),  # in practice this should be resolved
                           # with id_lookup
        ('caseC0', 'PMCcaseC0'),
        (None, 'PMCcaseC1'),
        ('28884161', 'PMC5579538'),
        ('26977217', 'PMC4771487'),
        (None, 'PMCcaseB1'),
        ('25409783', 'PMC4238023'),
        ('caseMisMatchA', 'PMCcaseMisMatchB'),
        ('caseMisMatchB', 'PMCcaseMisiMatchB'),
        ('caseMultiMatch', 'PMCcaseMultiMatch'),
    ])
    actual_pairs = [(tr.pmid, tr.pmcid) for tr in db.select_all('text_ref')]
    assert_contents_equal(actual_pairs, expected_pairs,
                          'DB text refs incorrect.')

    # The conflicting pmcid case must have been flagged in the review file.
    with open(pmc.review_fname, 'r') as f:
        found_conflict_msg = False
        for line in f.read().splitlines():
            if all([word in line
                    for word in ['PMC4771487', 'PMC5142709',
                                 'conflicting pmcid']]):
                found_conflict_msg = True
                break
    assert found_conflict_msg

    # Check the text content.
    # FIX: the original used `is 8`, an identity comparison with an int
    # literal that only "works" due to CPython's small-int caching and
    # raises a SyntaxWarning on Python 3.8+; equality is what is meant.
    assert len(db.select_all('text_content')) == 8, \
        'Too much DB text content.'
    remove(pmc.review_fname)
    return