def test_09_hybrid_oa(self):
    """Check the journal type hybrid_or_oa() records for each DOAJ lookup outcome.

    workflow.doaj_lookup is swapped for stubs returning True, False and None
    in turn; each run uses a fresh record.
    """
    def is_hybrid_lookup(msg):
        return False

    def is_oa_lookup(msg):
        return True

    def is_failed_lookup(msg):
        return None

    def classify(lookup):
        # Install the stub, then run hybrid_or_oa() against a new record
        workflow.doaj_lookup = lookup
        fresh = models.Record()
        workflow.hybrid_or_oa(workflow.WorkflowMessage(record=fresh))
        return fresh

    # Check that an OA record is correctly identified
    oa_record = classify(is_oa_lookup)
    assert oa_record.journal_type == "oa"
    assert len(oa_record.provenance) == 1

    # check that a hybrid journal is correctly identified
    hybrid_record = classify(is_hybrid_lookup)
    assert hybrid_record.journal_type == "hybrid"
    assert len(hybrid_record.provenance) == 1

    # check that no journal type is set if no issns are present, or
    # alternatively the DOAJ lookup fails for unknown reasons (stub -> None)
    failed_record = classify(is_failed_lookup)
    assert failed_record.journal_type is None
def test_10_process_record_03_aam_no_licence(self):
    """Run process_record() against fulltext whose licence element was removed.

    The metadata and fulltext fetchers, the DOAJ lookup, and the embargo and
    CORE steps are replaced by stubs.  The assertions expect the record to be
    flagged as an AAM with no licence, typed "oa", and its PMCID to be queued
    in the OAG register.
    """
    # NOTE(review): the workflow functions are replaced at module level and
    # not restored here -- presumably the fixture's tearDown puts them back;
    # confirm.
    def mock_get_md(*args, **kwargs):
        # Close the fixture file deterministically rather than leaking it
        with open(EPMC_MD, "r") as handle:
            md = epmcmod.EPMCMetadata(json.load(handle))
        return md, 1.0

    def mock_get_ft(*args, **kwargs):
        with open(EPMC_FT, "r") as handle:
            data = handle.read()
        xml = etree.fromstring(data)
        # Drop the first <license> element so no licence can be extracted
        licences = xml.xpath("//license")
        licences[0].getparent().remove(licences[0])
        return epmc.EPMCFullText(etree.tostring(xml))

    def mock_doaj(*args, **kwargs):
        return True

    def mock_romeo(*args, **kwargs):
        pass

    def mock_core(*args, **kwargs):
        pass

    workflow.get_epmc_md = mock_get_md
    workflow.get_epmc_fulltext = mock_get_ft
    workflow.doaj_lookup = mock_doaj
    workflow.embargo = mock_romeo
    workflow.ou_core = mock_core

    record = models.Record()
    record.pmcid = "PMC4219345"
    record.id = record.makeid()

    oag = []
    msg = workflow.WorkflowMessage(record=record, oag_register=oag)
    workflow.process_record(msg)

    # identifiers and metadata-derived fields
    assert record.confidence == 1.0
    assert record.pmcid == "PMC4219345"
    assert record.pmid == "24279897"
    assert record.doi == "10.1186/1471-2121-14-52"
    assert record.in_epmc is True
    assert record.is_oa is False
    assert len(record.issn) == 1
    assert "1471-2121" in record.issn
    assert record.id is not None  # implies it has been saved

    # fulltext-derived fields: AAM detected, but no licence
    assert record.has_ft_xml is True
    assert record.aam is True
    assert record.aam_from_xml is True
    assert record.licence_type is None
    assert record.licence_source is None
    assert record.journal_type == "oa"

    # the record is queued for OAG by its PMCID
    assert len(oag) == 1
    assert oag[0]["id"] == "PMC4219345"
    assert oag[0]["type"] == "pmcid"
def test_09_hybrid_oa(self):
    """Check hybrid_or_oa() records "oa" or "hybrid" from the DOAJ lookup result.

    workflow.doaj_lookup is replaced by a stub for each case, and each case
    runs against a fresh record.
    """
    def is_hybrid_lookup(msg):
        return False

    def is_oa_lookup(msg):
        return True

    # (stub lookup, journal type the record should end up with)
    scenarios = (
        (is_oa_lookup, "oa"),          # an OA record is correctly identified
        (is_hybrid_lookup, "hybrid"),  # a hybrid journal is correctly identified
    )
    for lookup, expected_type in scenarios:
        workflow.doaj_lookup = lookup
        record = models.Record()
        msg = workflow.WorkflowMessage(record=record)
        workflow.hybrid_or_oa(msg)
        assert record.journal_type == expected_type
        assert len(record.provenance) == 1
def test_07_ft_info(self):
    """Check extract_fulltext_info() on the EPMC fulltext fixture.

    Asserts the record is marked as having fulltext XML and as an AAM
    detected from that XML, with two provenance entries.
    """
    record = models.Record()
    msg = workflow.WorkflowMessage(record=record)

    # Read the fixture inside a context manager so the handle is closed
    with open(EPMC_FT, "r") as handle:
        data = handle.read()
    ft = epmc.EPMCFullText(data)

    workflow.extract_fulltext_info(msg, ft)

    assert record.has_ft_xml is True
    assert len(record.provenance) == 2
    assert record.aam is True
    assert record.aam_from_xml is True
def test_06_epmc_compliance_data(self):
    """Check extract_metadata() on the EPMC metadata fixture.

    Asserts the record is flagged as in EPMC, not OA, and carries the single
    ISSN "1471-2121".
    """
    record = models.Record()
    msg = workflow.WorkflowMessage(record=record)

    # Load the fixture inside a context manager so the handle is closed
    with open(EPMC_MD, "r") as handle:
        data = json.load(handle)
    epmc_md = epmcmod.EPMCMetadata(data)

    workflow.extract_metadata(msg, epmc_md)

    assert record.in_epmc is True
    assert record.is_oa is False
    assert len(record.issn) == 1
    assert "1471-2121" in record.issn
def test_03_doaj(self):
    """Check doaj_lookup() for an ISSN expected in DOAJ and for an invented one."""
    record = models.Record()
    msg = workflow.WorkflowMessage(record=record)

    # (issn, expected lookup result)
    expectations = (
        ("1338-3973", True),   # An OA journal
        ("1234-5678", False),  # a journal that we invented
    )
    for issn, expected in expectations:
        record.issn = issn
        assert workflow.doaj_lookup(msg) is expected
def test_02_get_fulltext_xml(self):
    """Check get_epmc_fulltext() for a retrievable PMCID and a failing one."""
    record = models.Record()
    msg = workflow.WorkflowMessage(record=record)

    # a successful fulltext retrieval
    record.pmcid = PMCID_SUCCESS
    fulltext = workflow.get_epmc_fulltext(msg)
    assert fulltext is not None
    assert fulltext.title == PMCID_SUCCESS_FT_TITLE, fulltext.title

    # failed fulltext retrieval
    record.pmcid = PMCID_ERROR
    assert workflow.get_epmc_fulltext(msg) is None
def test_10_process_record_02_no_md(self):
    """Run process_record() when the metadata lookup yields nothing.

    Asserts that no confidence is set, a single provenance entry is written
    and nothing is queued in the OAG register.
    """
    # Stub the metadata fetch so it reports "no metadata, no confidence"
    workflow.get_epmc_md = lambda *args, **kwargs: (None, None)

    record = models.Record()
    record.pmcid = "PMC4219345"
    record.id = record.makeid()

    oag = []
    workflow.process_record(
        workflow.WorkflowMessage(record=record, oag_register=oag)
    )

    assert record.confidence is None
    assert len(record.provenance) == 1
    assert len(oag) == 0
def test_10_process_record_01_everything(self):
    """Run process_record() with full metadata and unmodified fulltext.

    The metadata and fulltext fetchers, the DOAJ lookup (stubbed to False),
    and the embargo and CORE steps are replaced by stubs.  The assertions
    expect a cc-by licence sourced from the EPMC XML, a "hybrid" journal type
    and nothing queued in the OAG register.
    """
    # NOTE(review): the workflow functions are replaced at module level and
    # not restored here -- presumably the fixture's tearDown puts them back;
    # confirm.
    def mock_get_md(*args, **kwargs):
        # Close the fixture file deterministically rather than leaking it
        with open(EPMC_MD, "r") as handle:
            md = epmcmod.EPMCMetadata(json.load(handle))
        return md, 1.0

    def mock_get_ft(*args, **kwargs):
        with open(EPMC_FT, "r") as handle:
            data = handle.read()
        return epmc.EPMCFullText(data)

    def mock_doaj(*args, **kwargs):
        return False

    def mock_romeo(*args, **kwargs):
        pass

    def mock_core(*args, **kwargs):
        pass

    workflow.get_epmc_md = mock_get_md
    workflow.get_epmc_fulltext = mock_get_ft
    workflow.doaj_lookup = mock_doaj
    workflow.embargo = mock_romeo
    workflow.ou_core = mock_core

    record = models.Record()
    record.pmcid = "PMC4219345"
    record.id = record.makeid()

    oag = []
    msg = workflow.WorkflowMessage(record=record, oag_register=oag)
    workflow.process_record(msg)

    # identifiers and metadata-derived fields
    assert record.confidence == 1.0
    assert record.pmcid == "PMC4219345"
    assert record.pmid == "24279897"
    assert record.doi == "10.1186/1471-2121-14-52"
    assert record.in_epmc is True
    assert record.is_oa is False
    assert len(record.issn) == 1
    assert "1471-2121" in record.issn
    assert record.id is not None  # implies it has been saved

    # fulltext-derived fields: AAM and licence both found
    assert record.has_ft_xml is True
    assert record.aam is True
    assert record.aam_from_xml is True
    assert record.licence_type == "cc-by"
    assert record.licence_source == "epmc_xml"
    assert record.journal_type == "hybrid"

    # nothing is queued for OAG
    assert len(oag) == 0
def test_10_process_record_04_licence_no_aam(self):
    """Run process_record() against fulltext with its manuscript id removed.

    The metadata and fulltext fetchers and the DOAJ lookup (stubbed to True)
    are replaced by stubs.  The assertions expect `aam` to be False while
    `aam_from_xml` is True, a cc-by licence sourced from the EPMC XML, an
    "oa" journal type and nothing queued in the OAG register.
    """
    # NOTE(review): the workflow functions are replaced at module level and
    # not restored here -- presumably the fixture's tearDown puts them back;
    # confirm.
    def mock_get_md(*args, **kwargs):
        # Close the fixture file deterministically rather than leaking it
        with open(EPMC_MD, "r") as handle:
            md = epmcmod.EPMCMetadata(json.load(handle))
        return md, 1.0

    def mock_get_ft(*args, **kwargs):
        with open(EPMC_FT, "r") as handle:
            data = handle.read()
        xml = etree.fromstring(data)
        # Drop the first manuscript-type article-id from the fulltext
        manuscript_ids = xml.xpath("//article-id[@pub-id-type='manuscript']")
        manuscript_ids[0].getparent().remove(manuscript_ids[0])
        return epmc.EPMCFullText(etree.tostring(xml))

    def mock_doaj(*args, **kwargs):
        return True

    workflow.get_epmc_md = mock_get_md
    workflow.get_epmc_fulltext = mock_get_ft
    workflow.doaj_lookup = mock_doaj

    record = models.Record()
    record.pmcid = "PMC4219345"
    record.id = record.makeid()

    oag = []
    msg = workflow.WorkflowMessage(record=record, oag_register=oag)
    workflow.process_record(msg)

    # identifiers and metadata-derived fields
    assert record.confidence == 1.0
    assert record.pmcid == "PMC4219345"
    assert record.pmid == "24279897"
    assert record.doi == "10.1186/1471-2121-14-52"
    assert record.in_epmc is True
    assert record.is_oa is False
    assert len(record.issn) == 1
    assert "1471-2121" in record.issn
    assert record.id is not None  # implies it has been saved

    # fulltext-derived fields: licence found, not an AAM
    assert record.has_ft_xml is True
    assert record.aam is False
    assert record.aam_from_xml is True
    assert record.licence_type == "cc-by"
    assert record.licence_source == "epmc_xml"
    assert record.journal_type == "oa"

    # nothing is queued for OAG
    assert len(oag) == 0
def test_10_process_record_05_no_ft(self):
    """Run process_record() when metadata is found but no fulltext is available.

    The metadata fetcher, the fulltext fetcher (stubbed to None) and the DOAJ
    lookup (stubbed to False) are replaced by stubs.  The assertions expect
    no fulltext-derived AAM or licence data, a "hybrid" journal type, and the
    PMCID to be queued in the OAG register.
    """
    # NOTE(review): the workflow functions are replaced at module level and
    # not restored here -- presumably the fixture's tearDown puts them back;
    # confirm.
    def mock_get_md(*args, **kwargs):
        # Close the fixture file deterministically rather than leaking it
        with open(EPMC_MD, "r") as handle:
            md = epmcmod.EPMCMetadata(json.load(handle))
        return md, 1.0

    def mock_get_ft(*args, **kwargs):
        return None

    def mock_doaj(*args, **kwargs):
        return False

    workflow.get_epmc_md = mock_get_md
    workflow.get_epmc_fulltext = mock_get_ft
    workflow.doaj_lookup = mock_doaj

    record = models.Record()
    record.pmcid = "PMC4219345"
    record.id = record.makeid()

    oag = []
    msg = workflow.WorkflowMessage(record=record, oag_register=oag)
    workflow.process_record(msg)

    # identifiers and metadata-derived fields
    assert record.confidence == 1.0
    assert record.pmcid == "PMC4219345"
    assert record.pmid == "24279897"
    assert record.doi == "10.1186/1471-2121-14-52"
    assert record.in_epmc is True
    assert record.is_oa is False
    assert len(record.issn) == 1
    assert "1471-2121" in record.issn
    assert record.id is not None  # implies it has been saved

    # no fulltext, so no AAM or licence information
    assert record.has_ft_xml is False
    assert record.aam is None
    assert record.aam_from_xml is False
    assert record.licence_type is None
    assert record.licence_source is None
    assert record.journal_type == "hybrid"

    # the record is queued for OAG by its PMCID
    assert len(oag) == 1
    assert oag[0]["id"] == "PMC4219345"
    assert oag[0]["type"] == "pmcid"
def test_05_populate_identifiers(self):
    """Check populate_identifiers() fills missing identifiers and keeps set ones.

    The first call runs on an empty record; the second runs after the PMCID
    and PMID have been overwritten and the DOI deleted, and the assertions
    expect only the DOI to be re-populated from the metadata.
    """
    record = models.Record()
    msg = workflow.WorkflowMessage(record=record)

    # Load the fixture inside a context manager so the handle is closed
    with open(EPMC_MD, "r") as handle:
        data = json.load(handle)
    epmc_md = epmcmod.EPMCMetadata(data)

    # empty record: all three identifiers come from the metadata
    workflow.populate_identifiers(msg, epmc_md)
    assert record.pmcid == "PMC4219345"
    assert record.pmid == "24279897"
    assert record.doi == "10.1186/1471-2121-14-52"

    # pre-set identifiers are left alone; only the deleted DOI is restored
    record.pmcid = "PMC000000"
    record.pmid = "0000000"
    del record.doi
    workflow.populate_identifiers(msg, epmc_md)
    assert record.pmcid == "PMC000000"
    assert record.pmid == "0000000"
    assert record.doi == "10.1186/1471-2121-14-52"
def test_08_ft_licence(self):
    """Check extract_fulltext_licence() against variations of the <license> element.

    The fulltext fixture is parsed once and its first <license> element is
    mutated in place between cases; each case serialises the current tree and
    runs the extraction against a fresh record.
    """
    xlink_href = "{http://www.w3.org/1999/xlink}href"

    # Read the fixture inside a context manager so the handle is closed
    with open(EPMC_FT, "r") as handle:
        data = handle.read()
    xml = etree.fromstring(data)
    licence = xml.xpath("//license")[0]
    lp = licence.find("license-p")

    def extract():
        # Serialise the tree as it currently stands and run the licence
        # extraction on a brand new record
        ft = epmc.EPMCFullText(etree.tostring(xml))
        record = models.Record()
        msg = workflow.WorkflowMessage(record=record)
        workflow.extract_fulltext_licence(msg, ft)
        return record

    # licence in type attribute
    # note the missing "-"; to test the licence representation variations
    # at the same time
    licence.set("license-type", "cc by")
    licence.set(xlink_href, "http://random.url")
    lp.clear()
    record = extract()
    assert record.licence_type == "cc-by"
    assert record.licence_source == "epmc_xml"
    assert len(record.provenance) == 1

    # licence in href attribute
    licence.set("license-type", "open access")
    licence.set(xlink_href, "http://creativecommons.org/licenses/by-nd/3.0")
    record = extract()
    assert record.licence_type == "cc-by-nd"
    assert record.licence_source == "epmc_xml"
    assert len(record.provenance) == 1

    # licence in text
    licence.set("license-type", "open access")
    licence.set(xlink_href, "http://random.url")
    lp.text = "licence is <a href='http://creativecommons.org/licenses/by-nc-nd/3.0'>http://creativecommons.org/licenses/by-nc-nd/3.0</a>"
    record = extract()
    assert record.licence_type == "cc-by-nc-nd"
    assert record.licence_source == "epmc_xml"
    assert len(record.provenance) == 1

    # licence in /second/ licence paragraph
    lp.text = "some waffle"
    lp2 = etree.SubElement(licence, "license-p")
    lp2.text = "licence is <a href='http://creativecommons.org/licenses/by/3.0'>http://creativecommons.org/licenses/by/3.0</a>"
    record = extract()
    assert record.licence_type == "cc-by"
    assert record.licence_source == "epmc_xml"
    assert len(record.provenance) == 1

    # licence in words in text
    licence.set("license-type", "open access")
    licence.set(xlink_href, "http://random.url")
    lp.text = "This is a Creative Commons Attribution-NonCommercial licenced article"
    licence.remove(lp2)
    record = extract()
    assert record.licence_type == "cc-by-nc"
    assert record.licence_source == "epmc_xml"
    assert len(record.provenance) == 1

    # licence present but unrecognised
    lp.text = "wibble wibble wobble"
    record = extract()
    assert record.licence_type == "non-standard-licence"
    assert record.licence_source == "epmc_xml"
    assert len(record.provenance) == 1

    # no licence element present
    licence.getparent().remove(licence)
    record = extract()
    assert record.licence_type is None
    assert record.licence_source is None
    assert len(record.provenance) == 0
def test_02_send_to_oag(self):
    """Check which records register_with_oag() adds to the OAG register.

    A single record is mutated step by step through combinations of
    identifier, AAM flag and licence; every step runs against its own
    register list.
    """
    record = models.Record()

    def register(oag):
        # Wrap the shared record with the given register and run the step
        msg = workflow.WorkflowMessage(record=record, oag_register=oag)
        workflow.register_with_oag(msg)
        return oag

    def assert_single_entry(oag, identifier, id_type):
        assert len(oag) == 1
        assert oag[0].get("id") == identifier
        assert oag[0].get("type") == id_type

    # Has PMCID, AAM and Licence
    record.pmcid = "PMC1234"
    record.aam_from_xml = True
    record.licence_type = "CC BY"
    record.id = record.makeid()
    assert len(register([])) == 0

    # Has PMCID, AAM, but no licence
    record.pmcid = "PMC1234"
    record.aam_from_xml = True
    del record.licence_type
    assert_single_entry(register([]), "PMC1234", "pmcid")

    # Has PMCID, not AAM, Licence
    record.pmcid = "PMC1234"
    record.aam_from_xml = False
    record.licence_type = "CC BY"
    assert_single_entry(register([]), "PMC1234", "pmcid")

    # Has PMCID, not AAM or Licence
    record.pmcid = "PMC1234"
    record.aam_from_xml = False
    del record.licence_type
    assert_single_entry(register([]), "PMC1234", "pmcid")

    # No PMCID, has DOI and Licence
    # NOTE(review): no DOI has been assigned to the record at this point (it
    # is first set in the next case), so this runs with no identifier at all
    # -- confirm whether a DOI was meant to be set here.
    del record.pmcid
    record.aam_from_xml = False
    record.licence_type = "CC BY"
    assert len(register([])) == 0

    # No PMCID, has DOI, no Licence
    record.doi = "10.1234"
    del record.licence_type
    assert_single_entry(register([]), "10.1234", "doi")

    # No PMCID or DOI, has PMID but no Licence
    del record.doi
    record.pmid = "1234"
    assert_single_entry(register([]), "1234", "pmid")

    # No identifiers or licence
    del record.pmid
    del record.licence_type
    assert len(register([])) == 0

    # identifier which has previously been added to the run
    record.pmid = "1234"
    already_registered = [{"id": "1234", "type": "pmid"}]
    assert len(register(already_registered)) == 1
record = models.Record() record.upload_id = job.id record.upload_pos = 1 if args.type.lower() == "pmcid": record.pmcid = args.identifier elif args.type.lower() == "pmid": record.pmid = args.identifier elif args.type.lower() == "doi": record.doi = args.identifier record.save() time.sleep(2) oag_register = [] msg = workflow.WorkflowMessage(job, record, oag_register) workflow.process_record(msg) workflow.process_oag(oag_register, job) time.sleep(2) i = 0 while True: i += 1 pcc = job.pc_complete print i, job.pc_complete, "%", sys.stdout.flush() if int(pcc) == 100: break time.sleep(2)
def test_01_get_epmc_metadata(self):
    """Check get_epmc_md() across combinations of identifiers and titles.

    One record is mutated step by step; every lookup wraps it in a new
    WorkflowMessage.  Identifier matches are expected to come back with a
    confidence of 1.0, title matches with a confidence below 1.0, and failed
    lookups with (None, None).
    """
    record = models.Record()

    def lookup():
        # Run the metadata lookup on the record in its current state
        return workflow.get_epmc_md(workflow.WorkflowMessage(record=record))

    def assert_no_match():
        md, conf = lookup()
        assert md is None
        assert conf is None

    # an empty record, should result in a failure
    assert_no_match()

    # contains a pmcid that yields a result
    record.pmcid = PMCID_SUCCESS
    md, conf = lookup()
    assert md is not None
    assert md.title == PMCID_SUCCESS_TITLE
    assert conf == 1.0

    # contains a pmcid that does not yield a result
    record.pmcid = PMCID_ERROR
    assert_no_match()

    # contains invalid pmcid and valid pmid
    record.pmcid = PMCID_ERROR
    record.pmid = PMID_SUCCESS
    md, conf = lookup()
    assert md is not None
    assert md.title == PMID_SUCCESS_TITLE
    assert conf == 1.0

    # contains an invalid pmid only
    del record.pmcid
    record.pmid = PMID_ERROR
    assert_no_match()

    # invalid pmid and valid doi
    record.pmid = PMID_ERROR
    record.doi = DOI_SUCCESS
    md, conf = lookup()
    assert md is not None
    assert md.title == DOI_SUCCESS_TITLE
    assert conf == 1.0

    # contains invalid doi only
    del record.pmid
    record.doi = DOI_ERROR
    assert_no_match()

    # contains an invalid doi and a title which can be matched exactly
    record.doi = DOI_ERROR
    record.title = EXACT_TITLE
    md, conf = lookup()
    assert md is not None
    assert md.pmcid == EXACT_TITLE_PMCID
    assert conf < 1.0

    # contains a title that can be matched fuzzily
    del record.doi
    record.title = FUZZY_TITLE
    md, conf = lookup()
    assert md is not None
    assert md.pmcid == FUZZY_TITLE_PMCID
    assert conf < 1.0

    # contains a title that can't be matched in any way
    record.title = TITLE_ERROR
    assert_no_match()