def test_04_process_oag(self):
    # Hand a four-identifier register to workflow.process_oag and check that
    # an OAGR link is recorded for the job with all four identifiers pending.
    spreadsheet_job = models.SpreadsheetJob()
    spreadsheet_job.save()

    # (id, type) pairs: one pmcid, two dois and one pmid
    identifiers = [
        ("PMC1234", "pmcid"),
        ("10.1234", "doi"),
        ("10.5678", "doi"),
        ("abcd", "pmid"),
    ]
    oag_register = [
        {"id": ident, "type": id_type} for ident, id_type in identifiers
    ]

    workflow.process_oag(oag_register, spreadsheet_job)
    time.sleep(2)

    # the link ties the spreadsheet job to the OAGR job created for it
    oagr_link = models.OAGRLink.by_spreadsheet_id(spreadsheet_job.id)
    assert oagr_link is not None
    assert oagr_link.spreadsheet_id == spreadsheet_job.id
    assert oagr_link.oagrjob_id is not None

    oagr_job = oagr.dao.JobsDAO.pull(oagr_link.oagrjob_id)
    assert oagr_job is not None

    # nothing has been processed yet, so every identifier is still pending
    request_state = oagr_job.state()
    assert len(request_state.pending) == 4
def test_02_parse_csv(self):
    # Build and save a "submitted" spreadsheet job whose filename is
    # test_submission.csv, then run it through workflow.parse_csv.
    # There are no assertions here: the test passes if nothing raises.
    submission = models.SpreadsheetJob()
    submission.filename = "test_submission.csv"
    submission.contact_email = "*****@*****.**"
    submission.status_code = "submitted"
    submission.id = "test_submission"
    submission.save()

    workflow.parse_csv(submission)
def test_03_handle_oag_response_01_pmcid_success(self):
    # OAG reports a cc-by licence for a PMCID: the matching record should be
    # enhanced and nothing should be queued for a re-run.

    # first make ourselves a job/record that we want to enhance
    job = models.SpreadsheetJob()
    job.save()

    original = models.Record()
    original.upload_id = job.id
    original.pmcid = "PMC1234"
    original.save()
    time.sleep(2)

    # construct the OAG response object, which has detected a licence
    licence = {
        "type": "cc-by",
        "provenance": {
            "accepted_author_manuscript": True,  # FIXME: provisional
            "description": "Provenance PMC1234"
        }
    }
    oag_result = {
        "identifier": [{"id": "PMC1234", "type": "pmcid"}],
        "license": [licence]
    }

    # call the oag record callback
    rerun = []
    workflow.oag_record_callback(oag_result, rerun, job)

    # should not have added anything to the rerun
    assert len(rerun) == 0

    # give the index a moment to catch up
    time.sleep(2)

    matches = models.Record.get_by_identifier("PMC1234", job.id, "pmcid")
    enhanced = next(matches)
    assert isinstance(enhanced, models.Record)
    assert enhanced.id == original.id
    assert enhanced.pmcid == "PMC1234"

    # licence added, source=epmc, pmcid=success, provenance added, aam set
    assert enhanced.licence_type == "cc-by"
    assert enhanced.licence_source == "epmc"
    assert enhanced.oag_pmcid == "success"
    assert enhanced.aam_from_epmc is True
    assert enhanced.aam is True

    # the third element of each provenance entry is the note text
    notes = [note for _, _, note in enhanced.provenance]
    assert len(notes) == 2
    assert "PMC1234 - Provenance PMC1234" in notes
    assert "Detected AAM status from EPMC web page" in notes
def test_03_handle_oag_response_02_pmcid_fto(self):
    # OAG fails to obtain a licence for the PMCID: the record's DOI should be
    # queued for a re-run, and the AAM status should still be recorded.

    # first make ourselves a record that we want to enhance
    job = models.SpreadsheetJob()
    job.save()

    original = models.Record()
    original.upload_id = job.id
    original.pmcid = "PMC1234"
    original.doi = "10.1234"
    original.save()
    time.sleep(2)

    # construct the OAG response object, which failed to obtain a licence
    licence = {
        "type": "failed-to-obtain-license",
        "provenance": {
            "accepted_author_manuscript": True,  # FIXME: provisional
            "description": "FTO PMC1234"
        }
    }
    oag_result = {
        "identifier": [{"id": "PMC1234", "type": "pmcid"}],
        "license": [licence]
    }

    # call the oag record callback
    rerun = []
    workflow.oag_record_callback(oag_result, rerun, job)

    # should have added the DOI to the re-run
    assert len(rerun) == 1
    queued = rerun[0]
    assert queued["id"] == "10.1234"
    assert queued["type"] == "doi"

    # give the index a moment to catch up
    time.sleep(2)

    matches = models.Record.get_by_identifier("PMC1234", job.id, "pmcid")
    enhanced = next(matches)
    assert isinstance(enhanced, models.Record)

    # provenance added, pmcid=fto, aam set
    assert enhanced.licence_type is None
    assert enhanced.oag_pmcid == "fto"
    assert enhanced.aam_from_epmc is True
    assert enhanced.aam is True

    # the third element of each provenance entry is the note text
    notes = [note for _, _, note in enhanced.provenance]
    assert len(notes) == 2
    assert "PMC1234 - FTO PMC1234" in notes
    assert "Detected AAM status from EPMC web page" in notes
def test_03_handle_oag_response_04_pmcid_no_change(self):
    # The record already carries a licence and an AAM status taken from XML,
    # so a failed-to-obtain response from OAG must leave it untouched.

    # first make ourselves a record that we want to enhance
    job = models.SpreadsheetJob()
    job.save()

    original = models.Record()
    original.upload_id = job.id
    original.pmcid = "PMC1234"
    original.licence_type = "CC BY"
    original.aam = True
    original.aam_from_xml = True
    original.save()
    time.sleep(2)

    # construct the OAG response object, which failed to obtain a licence
    licence = {
        "type": "failed-to-obtain-license",
        "provenance": {
            "accepted_author_manuscript": False,  # FIXME: provisional
            "description": "You won't see this PMC1234"
        }
    }
    oag_result = {
        "identifier": [{"id": "PMC1234", "type": "pmcid"}],
        "license": [licence]
    }

    # call the oag record callback
    rerun = []
    workflow.oag_record_callback(oag_result, rerun, job)

    # should not have added anything to the rerun
    assert len(rerun) == 0

    # give the index a moment to catch up
    time.sleep(2)

    matches = models.Record.get_by_identifier("PMC1234", job.id, "pmcid")
    unchanged = next(matches)
    assert isinstance(unchanged, models.Record)

    # expecting no changes
    assert unchanged.licence_type == "CC BY"
    assert unchanged.licence_source is None
    assert unchanged.oag_pmcid is None
    assert unchanged.aam_from_epmc is False
    assert unchanged.aam is True

    notes = [note for _, _, note in unchanged.provenance]
    assert len(notes) == 0
def test_03_handle_oag_response_07_doi_fto(self):
    # OAG fails to obtain a licence for the DOI: the record's PMID should be
    # queued for a re-run and the failure noted in the provenance.

    # first make ourselves a record that we want to enhance
    job = models.SpreadsheetJob()
    job.save()

    original = models.Record()
    original.upload_id = job.id
    original.doi = "10.1234"
    original.pmid = "1234"
    original.save()
    time.sleep(2)

    # construct the OAG response object, which failed to obtain a licence
    licence = {
        "type": "failed-to-obtain-license",
        "provenance": {
            "description": "FTO 10.1234"
        }
    }
    oag_result = {
        "identifier": [{"id": "10.1234", "type": "doi"}],
        "license": [licence]
    }

    # call the oag record callback
    rerun = []
    workflow.oag_record_callback(oag_result, rerun, job)

    # should have added the PMID to the re-run
    assert len(rerun) == 1
    queued = rerun[0]
    assert queued["id"] == "1234"
    assert queued["type"] == "pmid"

    # give the index a moment to catch up
    time.sleep(2)

    # the identifier type is deliberately not passed to the lookup here
    matches = models.Record.get_by_identifier("10.1234", job.id)
    enhanced = next(matches)
    assert isinstance(enhanced, models.Record)

    # provenance added, doi=fto, pmid reprocess
    assert enhanced.licence_type is None
    assert enhanced.oag_doi == "fto"
    assert enhanced.aam is None

    notes = [note for _, _, note in enhanced.provenance]
    assert len(notes) == 1
    assert "10.1234 - FTO 10.1234" in notes
def test_03_handle_oag_response_06_doi_success(self):
    # OAG reports a cc-by licence for a DOI: the record should gain the
    # licence with source "publisher" and nothing should be re-run.

    # first make ourselves a record that we want to enhance
    job = models.SpreadsheetJob()
    job.save()

    original = models.Record()
    original.upload_id = job.id
    original.doi = "10.1234"
    original.save()
    time.sleep(2)

    # construct the OAG response object, which has detected a licence
    licence = {
        "type": "cc-by",
        "provenance": {
            "description": "Provenance 10.1234"
        }
    }
    oag_result = {
        "identifier": [{"id": "10.1234", "type": "doi"}],
        "license": [licence]
    }

    # call the oag record callback
    rerun = []
    workflow.oag_record_callback(oag_result, rerun, job)

    # should not have added anything to the rerun
    assert len(rerun) == 0

    # give the index a moment to catch up
    time.sleep(2)

    # leave out the "doi" type just for the hell of it
    matches = models.Record.get_by_identifier("10.1234", job.id)
    enhanced = next(matches)
    assert isinstance(enhanced, models.Record)

    # licence added, source=publisher, doi=success, provenance added
    assert enhanced.licence_type == "cc-by"
    assert enhanced.licence_source == "publisher"
    assert enhanced.oag_doi == "success"
    assert enhanced.aam is None

    notes = [note for _, _, note in enhanced.provenance]
    assert len(notes) == 1
    assert "10.1234 - Provenance 10.1234" in notes
def test_11_oag_callback_02_finished(self):
    # Fire the OAG callback with "finished" while both identifiers have been
    # requested once (max_retries=1) but never answered: both records should
    # be closed off with an error and a single provenance note.
    callback = workflow.oag_callback_closure()

    job = models.SpreadsheetJob()
    job.save()

    pmcids = ["PMC1234", "PMC9876"]
    state = oagclient.RequestState(list(pmcids), max_retries=1)
    state.record_requested(list(pmcids))

    # tie the spreadsheet job to the request state
    link = models.OAGRLink()
    link.spreadsheet_id = job.id
    link.oagrjob_id = state.id
    link.save()

    # one record per identifier
    for pmcid in pmcids:
        rec = models.Record()
        rec.upload_id = job.id
        rec.pmcid = pmcid
        rec.save()

    time.sleep(2)

    callback("finished", state)

    time.sleep(2)

    first = next(models.Record.get_by_identifier("PMC1234", job.id, "pmcid"))
    second = next(models.Record.get_by_identifier("PMC9876", job.id, "pmcid"))

    for rec, expected_note in (
        (first, "Attempted to retrieve PMC1234 1"),
        (second, "Attempted to retrieve PMC9876 1"),
    ):
        assert rec.in_oag is False
        assert len(rec.provenance) == 1
        # index 2 of a provenance entry holds the note text
        assert rec.provenance[0][2].startswith(expected_note)
        assert rec.oag_pmcid == "error"
        assert rec.oag_complete is True
def test_03_handle_oag_response_05_pmcid_error(self):
    # OAG returns an error for the PMCID: the record's PMID should be queued
    # for a re-run and the error message noted in the provenance.

    # first make ourselves a record that we want to enhance
    job = models.SpreadsheetJob()
    job.save()

    original = models.Record()
    original.upload_id = job.id
    original.pmcid = "PMC1234"
    original.pmid = "1234"
    original.save()
    time.sleep(2)

    # construct the OAG error object (here the identifier is a single dict,
    # not a list)
    identifier = {"id": "PMC1234", "type": "pmcid"}
    oag_result = {"identifier": identifier, "error": "broken!"}

    # call the oag record callback
    rerun = []
    workflow.oag_record_callback(oag_result, rerun, job)

    # should have added the PMID to the re-run
    assert len(rerun) == 1
    queued = rerun[0]
    assert queued["id"] == "1234"
    assert queued["type"] == "pmid"

    # give the index a moment to catch up
    time.sleep(2)

    matches = models.Record.get_by_identifier("PMC1234", job.id, "pmcid")
    enhanced = next(matches)
    assert isinstance(enhanced, models.Record)

    # provenance added, pmcid=error, pmid reprocess
    assert enhanced.licence_type is None
    assert enhanced.oag_pmcid == "error"

    notes = [note for _, _, note in enhanced.provenance]
    assert len(notes) == 1
    assert "PMC1234 - broken!" in notes
def csv_upload(flask_file_handle, filename, contact_email):
    """Store an uploaded CSV on disk and record it as a submitted job.

    The file is written to ``<UPLOAD_DIR>/<job id>.csv`` before the job
    record itself is saved.

    :param flask_file_handle: uploaded file object; only its ``save(path)``
        method is used
    :param filename: filename to record on the job
    :param contact_email: contact address to record on the job
    :return: the newly saved ``models.SpreadsheetJob``
    :raises WorkflowException: if ``UPLOAD_DIR`` is missing or empty in the
        app config
    """
    # make a record of the upload
    job = models.SpreadsheetJob()
    job.filename = filename
    job.contact_email = contact_email
    job.status_code = "submitted"
    job.id = job.makeid()

    # find out where to put the file
    upload_dir = app.config.get("UPLOAD_DIR")
    if upload_dir is None or upload_dir == "":
        raise WorkflowException("UPLOAD_DIR is not set")

    # save the file and the record of the upload
    destination = os.path.join(upload_dir, job.id + ".csv")
    flask_file_handle.save(destination)
    job.save()

    # return the job that was created, in case the caller wants to do
    # something with it
    return job
def test_15_record_maxed_02_no_match(self):
    # record_maxed is called for an identifier (PMC9876) that does not belong
    # to the only record in the job (PMC1234), so that record must be left
    # incomplete and without provenance.
    job = models.SpreadsheetJob()
    job.save()

    stored = models.Record()
    stored.pmcid = "PMC1234"
    stored.upload_id = job.id
    stored.save()

    time.sleep(2)

    maxed_info = {"requested": 20, "init": "2001-01-01T09:30:00Z"}
    rerun = []
    workflow.record_maxed("PMC9876", maxed_info, job, rerun)

    time.sleep(2)

    # re-read the record to see what, if anything, was written to it
    reloaded = models.Record.pull(stored.id)
    assert reloaded.oag_complete is False
    assert len(reloaded.provenance) == 0
def test_12_licence_translate(self):
    # "free-to-read" must be translated to "non-standard-licence", both by
    # the translation function directly and when it arrives via an OAG result.
    translated = workflow.translate_licence_type("free-to-read")
    assert translated == "non-standard-licence"

    # first make ourselves a record that we want to enhance
    job = models.SpreadsheetJob()
    job.save()

    original = models.Record()
    original.upload_id = job.id
    original.pmcid = "PMC1234"
    original.save()
    time.sleep(2)

    # construct the OAG response object, which has detected a licence
    licence = {
        "type": "free-to-read",
        "provenance": {
            "accepted_author_manuscript": False,  # FIXME: provisional
            "description": "FtR PMC1234"
        }
    }
    oag_result = {
        "identifier": [{"id": "PMC1234", "type": "pmcid"}],
        "license": [licence]
    }

    rerun = []
    workflow.oag_record_callback(oag_result, rerun, job)

    # give the index a moment to catch up
    time.sleep(2)

    matches = models.Record.get_by_identifier("PMC1234", job.id)
    enhanced = next(matches)
    assert isinstance(enhanced, models.Record)
    assert enhanced.licence_type == "non-standard-licence"
def test_14_oag_record_callback_duplicate(self):
    # Two distinct records in the same job share one PMCID; a single OAG
    # result for that PMCID must be applied to both of them.

    # first make ourselves a job/record that we want to enhance
    job = models.SpreadsheetJob()
    job.save()

    # make two distinct records with the same ids
    for _ in range(2):
        duplicate = models.Record()
        duplicate.upload_id = job.id
        duplicate.pmcid = "PMC1234"
        duplicate.save()

    time.sleep(2)

    # construct the OAG response object, which has detected a licence
    oag_result = {
        "identifier": [{"id": "PMC1234", "type": "pmcid"}],
        "license": [{
            "type": "cc-by",
            "provenance": {
                "accepted_author_manuscript": True,
                "description": "Provenance PMC1234"
            }
        }]
    }

    # call the oag record callback
    oag_rerun = []
    workflow.oag_record_callback(oag_result, oag_rerun, job)

    # give the index a moment to catch up
    time.sleep(2)

    # read the duplicate records out of the index
    records = list(models.Record.get_by_identifier("PMC1234", job.id, "pmcid"))

    # there should be 2 of them
    assert len(records) == 2

    for record in records:
        # check the loop variable itself, not a name left over from building
        # the list, so that every record is type-checked
        assert isinstance(record, models.Record)

        # both records should have the same data
        # licence added, source=epmc, pmcid=success, provenance added, aam set
        assert record.licence_type == "cc-by"
        assert record.licence_source == "epmc"
        assert record.oag_pmcid == "success"
        assert record.aam_from_epmc is True
        assert record.aam is True

        # the third element of each provenance entry is the note text
        provs = [note for _, _, note in record.provenance]
        assert len(provs) == 2
        assert "PMC1234 - Provenance PMC1234" in provs
        assert "Detected AAM status from EPMC web page" in provs

        assert record.oag_complete is True
def test_13_duplicate_check(self):
    # Build a job holding unique and duplicated identifiers of each type, run
    # workflow.duplicate_check over it, and verify that only the duplicated
    # records receive a provenance note naming the duplicated identifier type.

    # first make ourselves a job to work on
    job = models.SpreadsheetJob()
    job.save()

    # now make a bunch of records, some unique and some duplicate; each entry
    # is the ordered (field, value) pairs to set on one record
    fixtures = [
        # unique pmcid
        (("pmcid", "PMCunique"),),
        # duplicate pmcid
        (("pmcid", "PMCdupe"),),
        (("pmcid", "PMCdupe"),),
        # unique pmid
        (("pmid", "unique"),),
        # duplicate pmid
        (("pmid", "dupe"),),
        (("pmid", "dupe"),),
        # unique doi
        (("doi", "10.unique"),),
        # duplicate doi
        (("doi", "10.dupe"),),
        (("doi", "10.dupe"),),
        # one that is a duplicate of everything
        (("pmcid", "PMCdupe"), ("pmid", "dupe"), ("doi", "10.dupe")),
        # one that is confused about its duplication
        (("pmcid", "PMCdupe"), ("pmid", "dupe"), ("doi", "10.notdupe")),
    ]
    for fields in fixtures:
        rec = models.Record()
        rec.upload_id = job.id
        for field, value in fields:
            setattr(rec, field, value)
        rec.save()

    time.sleep(2)

    workflow.duplicate_check(job)

    time.sleep(2)

    # for each record, check that it got the provenance

    # unique identifiers - no provenance, one result each
    for identifier, id_type in (
        ("PMCunique", "pmcid"),
        ("unique", "pmid"),
        ("10.unique", "doi"),
    ):
        found = list(models.Record.get_by_identifier(identifier, job.id, id_type))
        for rec in found:
            assert len(rec.provenance) == 0
        assert len(found) == 1

    # duplicated identifiers - every match carries a note mentioning the
    # identifier type, and the number of matches is as constructed above
    for identifier, id_type, marker, expected in (
        ("PMCdupe", "pmcid", "PMCID", 4),
        ("dupe", "pmid", "PMID", 4),
        ("10.dupe", "doi", "DOI", 3),
    ):
        found = list(models.Record.get_by_identifier(identifier, job.id, id_type))
        for rec in found:
            # index 2 of a provenance entry holds the note text
            assert any(marker in entry[2] for entry in rec.provenance)
        assert len(found) == expected
def test_11_oag_callback_01_cycle(self):
    # Fire the OAG callback with "cycle" on a request state that already
    # holds one successful result and one error: the matching records should
    # be updated accordingly and both marked complete.
    cb = workflow.oag_callback_closure()
    assert cb is not None

    # the closure must hand back a plain function
    import types
    assert isinstance(cb, types.FunctionType)

    job = models.SpreadsheetJob()
    job.save()

    state = oagclient.RequestState(["PMC1234", "PMC9876"])
    oag_response = {
        "results": [{
            "identifier": [{
                "id": "PMC1234",
                "type": "epmc",
                "canonical": "PMC1234"
            }],
            "license": [{
                "type": "cc-by",
                "provenance": {
                    "description": "SUCCESS"
                }
            }]
        }],
        "errors": [{
            "identifier": {
                "id": "PMC9876",
                "type": "epmc",
                "canonical": "PMC9876"
            },
            "error": "ERROR"
        }]
    }
    state.record_result(oag_response)

    # tie the spreadsheet job to the request state
    oagrlink = models.OAGRLink()
    oagrlink.spreadsheet_id = job.id
    oagrlink.oagrjob_id = state.id
    oagrlink.save()

    # one record per identifier in the request
    for pmcid in ("PMC1234", "PMC9876"):
        record = models.Record()
        record.upload_id = job.id
        record.pmcid = pmcid
        record.save()

    time.sleep(2)

    cb("cycle", state)

    time.sleep(2)

    r1 = next(models.Record.get_by_identifier("PMC1234", job.id, "pmcid"))
    r2 = next(models.Record.get_by_identifier("PMC9876", job.id, "pmcid"))

    # the successful identifier gets its licence details filled in
    assert r1.in_oag is False
    assert len(r1.provenance) == 1
    assert "SUCCESS" in r1.provenance[0][2]
    assert r1.oag_pmcid == "success"
    assert r1.licence_source == "epmc"
    assert r1.licence_type == "cc-by"
    assert r1.oag_complete is True

    # the errored identifier is closed off with the error noted
    assert r2.in_oag is False
    assert r2.oag_pmcid == "error"
    assert len(r2.provenance) == 1
    assert "ERROR" in r2.provenance[0][2]
    assert r2.oag_complete is True
def test_01_export(self):
    # Export a job with three records (one fully populated, two sparse) via
    # workflow.output_csv and compare the parsed CSV against expected rows.

    # make a job - we don't much care about its content for this test
    job = models.SpreadsheetJob()
    job.save()

    stamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")

    # make a few records for it

    # all fields filled in correctly
    full = models.Record()
    full.pmcid = "PMC1234"
    full.pmid = "1234"
    full.doi = "10.1234"
    full.title = "The Title"
    full.has_ft_xml = True
    full.in_epmc = True
    full.aam = True
    full.is_oa = True
    full.licence_type = "CC0"
    full.licence_source = "publisher"
    full.journal_type = "hybrid"
    full.confidence = 0.9
    full.add_provenance("test", "provenance", stamp)
    full.upload_id = job.id
    full.upload_pos = 1
    full.journal = "Journal of Science"
    full.issn = ["1234-5678", "9876-5432"]
    full.save()

    # only a pmcid
    pmcid_only = models.Record()
    pmcid_only.pmcid = "PMC9876"
    pmcid_only.upload_id = job.id
    pmcid_only.upload_pos = 2
    pmcid_only.save()

    # only a pmid, empty title/licence values and two provenance entries
    pmid_only = models.Record()
    pmid_only.pmid = "9876"
    pmid_only.upload_id = job.id
    pmid_only.upload_pos = 3
    pmid_only.title = None
    pmid_only.licence_type = ""
    pmid_only.add_provenance("test", "provenance", stamp)
    pmid_only.add_provenance("test", "more", stamp)
    pmid_only.save()

    # refresh the index ready for querying
    models.SpreadsheetJob.refresh()
    models.Record.refresh()

    csv_text = workflow.output_csv(job)
    rows = list(csv.reader(StringIO(csv_text)))

    expected_header = [
        'PMCID', 'PMID', 'DOI', "Journal title", "ISSN", 'Article title',
        "Fulltext in EPMC?", 'XML Fulltext?', 'AAM?', 'Open Access?',
        'Licence', 'Licence Source', 'Journal Type',
        'Correct Article Confidence', 'Standard Compliance?',
        'Deluxe Compliance?', 'Compliance Processing Ouptut'
    ]
    expected_full = [
        'PMC1234', '1234', '10.1234', "Journal of Science",
        "1234-5678, 9876-5432", 'The Title', "True", 'True', 'True', 'True',
        'CC0', 'publisher', 'hybrid', '0.9', "True", "True",
        '[' + stamp + ' test] provenance'
    ]
    expected_pmcid_only = [
        "PMC9876", "", "", "", "", "", "", "", "unknown", "", "unknown",
        "", "", "", "False", "False", ""
    ]
    # multiple provenance entries are separated by a blank line in one cell
    expected_pmid_only = [
        "", "9876", "", "", "", "", "", "", "unknown", "", "unknown",
        "", "", "", "False", "False",
        '[' + stamp + ' test] provenance\n\n[' + stamp + ' test] more'
    ]

    # header plus one row per record, in upload_pos order
    assert len(rows) == 4
    assert rows[0] == expected_header
    assert rows[1] == expected_full
    assert rows[2] == expected_pmcid_only
    assert rows[3] == expected_pmid_only
parser.add_argument("-i", "--identifier", help="identifier to run through the system")
args = parser.parse_args()

# both the identifier and its type must be supplied
if args.identifier is None or args.type is None:
    parser.print_help()
    exit()

# normalise the type once; it is compared case-insensitively
id_type = args.type.lower()
if id_type not in ["pmcid", "pmid", "doi"]:
    # parenthesised so the line is valid under both Python 2 and Python 3
    print("Type must be one of pmcid, pmid or doi")
    parser.print_help()
    exit()

# we must create a job with a single record for it to be run
job = models.SpreadsheetJob()
job.contact_email = "*****@*****.**"
job.save()

record = models.Record()
record.upload_id = job.id
record.upload_pos = 1

# store the identifier on the field matching its type
if id_type == "pmcid":
    record.pmcid = args.identifier
elif id_type == "pmid":
    record.pmid = args.identifier
elif id_type == "doi":
    record.doi = args.identifier

record.save()