def test_full_upload():
    """Test whether we can perform a targeted upload to a test db.

    Populates a test database from the Pubmed, PmcOA, and Manuscripts
    managers in turn (all pointed at the test ftp url with ``local=True``),
    checking the text refs and text content present after each step, and
    finishes with a shallow run of the careful medline file load.
    """
    # This uses a specially curated sample directory designed to access most
    # code paths that the real system might experience, but on a much smaller
    # (thus faster) scale. Errors in the ftp service will not be caught by
    # this test.

    # Test the medline/pubmed upload.
    db, pm = get_test_db_with_pubmed_content(with_pm=True)
    tr_list = db.select_all('text_ref')
    assert len(tr_list), "No text refs were added..."
    # A bare hasattr() check would also pass for a pmid of None, so check the
    # value itself rather than the mere presence of the attribute.
    assert all(getattr(tr, 'pmid', None) is not None for tr in tr_list),\
        'All text_refs MUST have pmids by now.'
    mra_list = db.select_all(db.MeshRefAnnotations)
    num_mra_exp = sum(len(ann) for ann in pm.annotations.values())
    assert len(mra_list) == num_mra_exp,\
        "Only %s/%s annotations added" % (len(mra_list), num_mra_exp)
    # Same reasoning as for the pmids: require an actual value.
    assert all(getattr(mra, 'mesh_num', None) is not None
               for mra in mra_list), \
        'All MESH annotations should have a mesh ID Number.'

    # Test the pmc oa upload.
    PmcOA(ftp_url=get_test_ftp_url(), local=True).populate(db)
    tcs_pmc = db.filter_query(
        db.TextContent,
        db.TextContent.source == PmcOA.my_source
    ).count()
    assert tcs_pmc, "No pmc oa fulltext was added."
    trs_w_pmcids = db.filter_query(
        db.TextRef,
        db.TextRef.pmcid.isnot(None)
    ).count()
    # Every piece of pmc oa content needs a text ref carrying a pmcid.
    assert trs_w_pmcids >= tcs_pmc,\
        "Only %d of at least %d pmcids added." % (trs_w_pmcids, tcs_pmc)

    # Test the manuscripts upload.
    Manuscripts(ftp_url=get_test_ftp_url(), local=True).populate(db)
    tcs_manu = db.filter_query(
        db.TextContent,
        db.TextContent.source == Manuscripts.my_source
    ).count()
    assert tcs_manu, "No manuscripts uploaded."
    trs_w_mids = db.filter_query(
        db.TextRef,
        db.TextRef.manuscript_id.isnot(None)
    ).count()
    assert trs_w_mids >= tcs_manu,\
        "Only %d of at least %d manuscript ids added." % (trs_w_mids, tcs_manu)

    # Some overall checks.
    tc_list = db.select_all(db.TextContent)
    set_exp = {('manuscripts', 'xml', 'fulltext'),
               ('pmc_oa', 'xml', 'fulltext'),
               ('pubmed', 'text', 'abstract'),
               ('pubmed', 'text', 'title')}
    set_got = {(tc.source, tc.format, tc.text_type) for tc in tc_list}
    assert set_exp == set_got,\
        "Expected %s, got %s for content layout." % (set_exp, set_got)

    # Test careful upload of medline (very shallow test...checks only for
    # critical failures)
    m = Pubmed(ftp_url=get_test_ftp_url(), local=True)
    m.load_files(db, 'baseline', carefully=True)
def get_test_db_with_pubmed_content(with_pm=False):
    """Populate a cleared temp database with sample content from pubmed.

    Parameters
    ----------
    with_pm : bool
        If True, also hand back the Pubmed manager that did the populating.

    Returns
    -------
    Either the database alone, or a ``(database, manager)`` tuple when
    `with_pm` is True.
    """
    database = get_temp_db(clear=True)
    pubmed_manager = Pubmed(ftp_url=get_test_ftp_url(), local=True)
    pubmed_manager.populate(database)
    return (database, pubmed_manager) if with_pm else database
def test_multible_pmc_oa_content():
    """Test to make sure repeated content is handled correctly.

    Populates the db from the same PmcOA manager twice and requires that the
    number of text_content rows is unchanged by the second pass.
    """
    db = get_temp_db()
    oa_manager = PmcOA(ftp_url=get_test_ftp_url(), local=True)

    oa_manager.populate(db)
    count_after_first = len(db.select_all('text_content'))

    oa_manager.populate(db)
    count_after_second = len(db.select_all('text_content'))

    assert count_after_second == count_after_first,\
        "Duplicate text content allowed to be submitted."
def test_multiple_pmids():
    """Test that pre-existing pmids are correctly handled.

    Populates the db from the same Pubmed manager twice and requires that the
    number of text_ref rows is unchanged by the second pass.
    """
    db = get_temp_db()
    pubmed_manager = Pubmed(ftp_url=get_test_ftp_url(), local=True)

    pubmed_manager.populate(db)
    count_after_first = len(db.select_all('text_ref'))

    pubmed_manager.populate(db)
    count_after_second = len(db.select_all('text_ref'))

    assert count_after_second == count_after_first,\
        "Duplicate pmids allowed to be submitted.."
def test_multiple_text_ref_pmc_oa():
    """Test whether a duplicate text ref in pmc oa is handled correctly.

    Uploads the same single text ref entry twice through
    ``PmcOA.upload_batch`` and requires that the second upload adds no
    text_ref rows. The review file named for this test is removed at the end.
    """
    db = get_temp_db()
    oa_manager = PmcOA(ftp_url=get_test_ftp_url(), local=True)
    oa_manager.review_fname = 'test_review_multiple_text_ref_pmc_oa.txt'

    # Start from an all-None entry over the manager's text ref columns, then
    # fill in only the ids relevant to this test.
    ref_entry = {col: None for col in oa_manager.tr_cols}
    ref_entry['pmcid'] = 'PMC5579538'
    ref_entry['doi'] = '10.1021/acsomega.7b00205'

    oa_manager.upload_batch(db, [ref_entry], [])
    count_after_first = len(db.select_all('text_ref'))

    oa_manager.upload_batch(db, [ref_entry], [])
    count_after_second = len(db.select_all('text_ref'))

    assert count_after_second == count_after_first,\
        "Duplicate refs allowed to be submitted.."
    remove(oa_manager.review_fname)
def get_test_db_with_ftp_content():
    """Populate database with content from all the ftp services.

    Starts from the pubmed-populated test database, then populates it from
    the PmcOA and Manuscripts managers, in that order.
    """
    db = get_test_db_with_pubmed_content()
    for manager_cls in (PmcOA, Manuscripts):
        manager_cls(ftp_url=get_test_ftp_url(), local=True).populate(db)
    return db
def test_medline_ref_checks():
    """Test the text ref checks used by medline.

    Feeds three rounds of (pmid, pmcid) pairs through
    ``Pubmed.load_text_refs`` and compares the text refs found in the db
    against the expected pairs after each round. The third round is run
    carefully and must append exactly two lines to the review file.
    """
    db = get_temp_db(clear=True)
    med = Pubmed(ftp_url=get_test_ftp_url(), local=True)

    def check_input(input_pairs, expected_pairs, carefully, num):
        # Build the per-pmid article info; for a repeated pmid the later
        # pair overwrites the earlier one.
        article_info = {}
        for pmid, pmcid in input_pairs:
            article_info[pmid] = {'pmid': pmid, 'pmcid': pmcid}
        med.load_text_refs(db, article_info, carefully)

        actual_pairs = [(ref.pmid, ref.pmcid)
                        for ref in db.select_all(db.TextRef)]
        if carefully:
            desc = 'careful'
        else:
            desc = 'careless'
        msg = 'DB text refs mismatch after upload %d (%s)' % (num, desc)

        # Sort by string form, since the pairs mix None with strings.
        # Note that the caller's expected list is sorted in place.
        actual_pairs.sort(key=str)
        expected_pairs.sort(key=str)
        assert_contents_equal(actual_pairs, expected_pairs, msg)

    expected_pairs = [
        ('CASEA', None),
        ('CASEB', 'PMCIDCASEB'),
        ('CASEC', None),
        ('CASED', 'PMCIDCASED'),
    ]

    # Upload round 1
    round_one_input = [
        ('CASEA', None),
        ('CASEB', 'PMCIDCASEB'),
        ('CASEC', None),
        ('CASEC', None),
        ('CASED', None),
        ('CASED', 'PMCIDCASED'),
    ]
    check_input(round_one_input, expected_pairs, False, 1)

    # Upload round 2
    expected_pairs += [('CASEE', None)]
    round_two_input = [
        ('CASEE', None),
        ('CASEC', 'PMCIDCASEC'),
        ('CASEH1', 'PMCIDCASEH'),
        ('CASEK', 'PMCIDCASEK1'),
    ]
    round_two_expected = expected_pairs + [('CASEH1', 'PMCIDCASEH'),
                                           ('CASEK', 'PMCIDCASEK1')]
    check_input(round_two_input, round_two_expected, False, 2)

    # Interlude
    db.insert_many('text_ref', [{'pmcid': 'PMCIDCASEG'}])

    # Upload round 3
    # The input must be assembled before expected_pairs is edited below.
    input_pairs = expected_pairs + [
        ('CASEF', None),
        ('CASEC', 'PMCIDCASEC'),
        ('CASEG', 'PMCIDCASEG'),
        ('CASEH2', 'PMCIDCASEH'),  # this should trigger a review.
        ('CASEK', 'PMCIDCASEK2'),  # and so should this
    ]
    expected_pairs.remove(('CASEC', None))
    expected_pairs += [
        ('CASEF', None),
        ('CASEC', 'PMCIDCASEC'),
        ('CASEG', 'PMCIDCASEG'),
        ('CASEH1', 'PMCIDCASEH'),
        ('CASEK', 'PMCIDCASEK1'),
    ]

    med.review_fname = 'test_review_%s.txt' % med.my_source
    # Make sure the review file exists without disturbing existing content,
    # then record how many lines it starts with.
    with open(med.review_fname, 'a+'):
        pass
    with open(med.review_fname, 'r') as review_file:
        num_orig_lines = len(review_file.readlines())

    check_input(input_pairs, expected_pairs, True, 3)

    with open(med.review_fname, 'r') as review_file:
        lines = review_file.readlines()
    num_exp_lines = num_orig_lines + 2
    assert len(lines) == num_exp_lines, \
        "Not all new reviews added: %d / %d" % (len(lines), num_exp_lines)
    remove(med.review_fname)
def test_id_handling_pmc_oa():
    """Test every conceivable combination pmid/pmcid presence.

    Seeds the text_ref table with a set of (pmid, pmcid) pairs, submits a
    batch of text refs and text content through ``PmcOA.upload_batch``, and
    then checks three things: the resulting (pmid, pmcid) pairs in the db,
    that the review file has a line reporting the conflicting pmcid, and the
    number of text_content rows.
    """
    db = get_temp_db(clear=True)
    pmc = PmcOA(ftp_url=get_test_ftp_url(), local=True)

    # Initialize with all possible states we could have gotten from medline.
    pm_inp_tpl_list = capitalize_list_of_tpls(
        [('caseA%d' % i, 'PMCcaseA%d' % i) for i in range(2)] +
        [('caseB%d' % i, None) for i in range(2)] +
        [(None, 'PMCcaseC%d' % i) for i in range(2)] +
        [('caseMisMatchA', 'PMCcaseMisMatchB'),
         ('caseMisMatchB', 'PMCcaseMisiMatchB'),
         ('caseMultiMatch', 'PMCcaseMultiMatch'),
         ('28884161', None),
         ('26977217', 'PMC4771487')]
    )
    db.insert_many(
        'text_ref',
        [dict(zip(('pmid', 'pmcid'), d)) for d in pm_inp_tpl_list]
    )

    # Prepare the 'batch' to be submitted for pmc oa, and try it.
    oa_inp_tpl_list = capitalize_list_of_tpls(
        [('case%s0' % letter, 'PMCcase%s0' % letter)
         for letter in ['A', 'B', 'C']] +
        [(None, 'PMCcase%s1' % letter) for letter in ['A', 'B', 'C']] +
        [
            (None, 'PMC5579538'),  # lookup pmid in db
            (None, 'PMC4238023'),  # lookup no pmid in db
            ('26977217', 'PMC5142709'),  # conflicting pmcid
            ('caseMisMatchB', 'PMCcaseMisMatchA'),  # multiple matches
            ('caseMultiMatch', 'PMCnotmatching'),
            ('notmatching', 'PMCcaseMultiMatch'),
        ]
    )
    tr_inp = []
    for pmid, pmcid in oa_inp_tpl_list:
        inp_dict = dict.fromkeys(pmc.tr_cols)
        inp_dict.update(pmcid=pmcid, pmid=pmid)
        tr_inp.append(inp_dict)
    tc_inp = [{'pmcid': pmcid, 'text_type': 'txt', 'content': b'content'}
              for _, pmcid in oa_inp_tpl_list]
    pmc.review_fname = 'test_review_%s.txt' % pmc.my_source
    pmc.upload_batch(db, tr_inp, tc_inp)

    # Check the text refs.
    expected_pairs = capitalize_list_of_tpls([
        ('caseA0', 'PMCcaseA0'),
        ('caseA1', 'PMCcaseA1'),
        ('caseB0', 'PMCcaseB0'),
        ('caseB1', None),  # in practice this should be resolved with id_lookup
        ('caseC0', 'PMCcaseC0'),
        (None, 'PMCcaseC1'),
        ('28884161', 'PMC5579538'),
        ('26977217', 'PMC4771487'),
        (None, 'PMCcaseB1'),
        ('25409783', 'PMC4238023'),
        ('caseMisMatchA', 'PMCcaseMisMatchB'),
        ('caseMisMatchB', 'PMCcaseMisiMatchB'),
        ('caseMultiMatch', 'PMCcaseMultiMatch'),
    ])
    actual_pairs = [(tr.pmid, tr.pmcid) for tr in db.select_all('text_ref')]
    assert_contents_equal(actual_pairs, expected_pairs,
                          'DB text refs incorrect.')

    # The conflicting pmcid must have been reported on a single review line
    # that names both pmcids involved.
    conflict_words = ['PMC4771487', 'PMC5142709', 'conflicting pmcid']
    with open(pmc.review_fname, 'r') as f:
        found_conflict_msg = any(
            all(word in line for word in conflict_words)
            for line in f.read().splitlines()
        )
    assert found_conflict_msg, \
        'No review entry found for the conflicting pmcid.'

    # Check the text content. Compare by value: `is` on an int only worked
    # through CPython's small-integer caching.
    num_text_content = len(db.select_all('text_content'))
    assert num_text_content == 8, \
        'Expected 8 DB text content entries, got %d.' % num_text_content
    remove(pmc.review_fname)