def test_01_read(self):
    ms = sheets.MasterSheet(path=TEST_SUBMISSION)
    objects = False
    for o in ms.objects():
        objects = True
        # just check a few fields to make sure the object looks reasonable
        assert "university" in o
        assert "pmcid" in o
        assert "journal_title" in o
    assert objects
def test_05_defaults(self):
    s = StringIO()
    ms = sheets.MasterSheet(writer=s)
    ms.add_object({
        "aam" : None,
        "licence" : "",
        "university" : "A"
    })
    size = 0
    for o in ms.objects():
        size += 1
        assert o.get("aam") == "unknown"
        assert o.get("licence") == "unknown"
        assert o.get("university") == "A"
    assert size == 1
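# test_05 pins down the defaulting behaviour: values which are None or the
# empty string come back as the field's default ("unknown" for aam and
# licence; the empty string elsewhere, as test_03 below shows for doi).
# A minimal sketch of that rule with plain dicts - the DEFAULTS table and
# apply_defaults helper are hypothetical, not sheets.MasterSheet's internals:

DEFAULTS = {"aam": "unknown", "licence": "unknown"}    # everything else defaults to ""

def apply_defaults(obj, fields):
    out = {}
    for f in fields:
        v = obj.get(f)
        if v is None or v == "":
            v = DEFAULTS.get(f, "")    # fall back to the field's default
        out[f] = v
    return out

# e.g. apply_defaults({"aam": None, "licence": "", "university": "A"},
#                     ["aam", "licence", "university"])
# == {"aam": "unknown", "licence": "unknown", "university": "A"}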
def test_03_write_full(self):
    s = StringIO()
    ms = sheets.MasterSheet(writer=s)

    # add an object which conforms to the full spec
    ms.add_object({
        "university" : "A",
        "pmcid" : "a",
        "journal_title" : "1"
    })

    # check that the record has been written
    size = 0
    for o in ms.objects():
        size += 1
        assert o.get("university") == "A"
        assert o.get("pmcid") == "a"
        assert o.get("journal_title") == "1"
        assert o.get("doi") == ""
        assert len(o.keys()) == len(ms.OUTPUT_ORDER)
    assert size == 1

    # now add an object with too much data for the spec
    ms.add_object({
        "university" : "C",
        "pmcid" : "c",
        "something_else" : "Gamma"
    })

    # check that the new record has been written correctly (with suitable defaults,
    # and with the unrecognised field dropped)
    size = 0
    found = False
    for o in ms.objects():
        size += 1
        if o.get("university") == "C":
            found = True
            assert o.get("journal_title") == ""
            assert o.get("pmcid") == "c"
            assert "something_else" not in o
    assert size == 2
    assert found
def test_04_output(self):
    # set up a very simple subset sheet (note it's not in the desired output order)
    spec = ["journal_title", "university", "pmcid"]
    s = StringIO()
    ms = sheets.MasterSheet(writer=s, spec=spec)

    # add an object which conforms to the spec of the subset
    ms.add_object({
        "university" : "A",
        "pmcid" : "a",
        "journal_title" : "1"
    })

    # output the sheet to the StringIO object
    ms.save()

    # now open the StringIO in the python standard csv reader
    s.seek(0)
    reader = csv.reader(s)
    rows = [row for row in reader]

    assert len(rows) == 2
    assert rows[0] == ["University", "PMCID", "Journal title"]
    assert rows[1] == ["A", "a", "1"]
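# test_04 implies two things about output: columns are re-sorted into the
# sheet's canonical order (cf. ms.OUTPUT_ORDER in test_03), and internal
# field names are mapped to human-readable column headers.  A minimal sketch
# of that mapping - HEADERS and header_row are hypothetical stand-ins, not
# the actual sheets.MasterSheet implementation:

HEADERS = {
    "university": "University",
    "pmcid": "PMCID",
    "journal_title": "Journal title"
}
OUTPUT_ORDER = ["university", "pmcid", "journal_title"]    # canonical column order

def header_row(spec):
    # keep only the specced fields, but always emit them in OUTPUT_ORDER
    return [HEADERS[f] for f in OUTPUT_ORDER if f in spec]

assert header_row(["journal_title", "university", "pmcid"]) == ["University", "PMCID", "Journal title"]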
def parse_csv(job):
    app.logger.info("Loading records from " + job.id)

    # find out where to get the file
    upload = app.config.get("UPLOAD_DIR")
    if upload is None or upload == "":
        raise WorkflowException("UPLOAD_DIR is not set")
    path = os.path.join(upload, job.id + ".csv")

    # FIXME: what happens if the sheet can't be read?
    sheet = sheets.MasterSheet(path)

    i = 0
    for obj in sheet.objects():
        i += 1
        r = models.Record()
        r.upload_id = job.id
        r.upload_pos = i
        r.set_source_data(**obj)

        # also copy the various identifiers over into the locations where they can be
        # normalised and used for lookup
        if obj.get("pmcid") is not None and obj.get("pmcid") != "":
            npmcid = normalise_pmcid(obj.get("pmcid"))
            if npmcid is not None:
                r.pmcid = npmcid
                note = "normalised PMCID %(source)s to %(target)s" % {"source": obj.get("pmcid"), "target": r.pmcid}
            else:
                note = "PMCID %(source)s was syntactically invalid, so ignoring" % {"source": obj.get("pmcid")}
            r.add_provenance("importer", note)

        if obj.get("pmid") is not None and obj.get("pmid") != "":
            npmid = normalise_pmid(obj.get("pmid"))
            if npmid is not None:
                r.pmid = npmid
                note = "normalised PMID %(source)s to %(target)s" % {"source": obj.get("pmid"), "target": r.pmid}
            else:
                note = "PMID %(source)s was syntactically invalid, so ignoring" % {"source": obj.get("pmid")}
            r.add_provenance("importer", note)

        if obj.get("doi") is not None and obj.get("doi") != "":
            ndoi = normalise_doi(obj.get("doi"))
            if ndoi is not None:
                r.doi = ndoi
                note = "normalised DOI %(source)s to %(target)s" % {"source": obj.get("doi"), "target": r.doi}
            else:
                note = "DOI %(source)s was syntactically invalid, so ignoring" % {"source": obj.get("doi")}
            r.add_provenance("importer", note)

        if obj.get("article_title") is not None and obj.get("article_title") != "":
            r.title = obj.get("article_title")

        r.save()

    app.logger.info("Loaded " + str(i) + " records from spreadsheet")

    # refresh the index so the data is ready to use
    # FIXME: I'm not totally convinced this a/ works or b/ is a good idea -
    # refresh can behave quite strangely sometimes
    models.Record.refresh()
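# parse_csv leans on normalise_pmcid, normalise_pmid and normalise_doi, which
# are defined elsewhere in the codebase.  The contract the code above relies
# on: return the canonical form of the identifier, or None if it is
# syntactically invalid (so the caller can record a provenance note).  A
# minimal sketch of that shape - the name and the validation rule here are
# illustrative assumptions, not the real implementation:

import re

def example_normalise_pmcid(pmcid):
    # accept "PMC1234567" in any case, with or without the prefix, and
    # return the canonical "PMC"-prefixed form; None signals invalid input
    if pmcid is None:
        return None
    m = re.match(r"^(?:PMC)?(\d+)$", pmcid.strip(), re.IGNORECASE)
    if m is None:
        return None
    return "PMC" + m.group(1)

# e.g. example_normalise_pmcid("pmc123") == "PMC123"
#      example_normalise_pmcid("PMC 123") is None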
def output_csv(job):
    def serialise_provenance(r):
        s = ""
        first = True
        for by, when, what in r.provenance:
            if not first:
                s += "\n\n"
            else:
                first = False
            s += "[%(when)s %(by)s] %(what)s" % {"when": when, "by": by, "what": what}
        return s

    def objectify(r):
        obj = {
            # the identifiers
            "pmcid": r.pmcid,
            "pmid": r.pmid,
            "doi": r.doi,
            "article_title": r.title,

            # the results of the run
            "in_epmc": r.in_epmc,
            "xml_ft_in_epmc": r.has_ft_xml,
            "aam": r.aam,
            "open_access": r.is_oa,
            "licence": r.licence_type,
            "licence_source": r.licence_source,
            "journal_type": r.journal_type,
            "confidence": r.confidence,
            "standard_compliance": r.standard_compliance,
            "deluxe_compliance": r.deluxe_compliance,
            "provenance": serialise_provenance(r),
            "issn": ", ".join(r.issn),

            # this is also a result of the run, but it can be overridden by the source data
            # if it was passed in and not empty
            "journal_title": r.journal
        }

        # add the original data if present, being careful not to overwrite the data we have produced
        if r.source is not None:
            # list the fields whose run-produced values should win; these get removed
            # from the source copy before merging.  journal_title only wins if the
            # source does not contain a value
            overwrite = list(obj.keys())
            jt = r.source.get("journal_title")
            if jt is not None and jt != "":
                overwrite.remove("journal_title")

            original = deepcopy(r.source)
            for k in overwrite:
                if k in original:
                    del original[k]
            obj.update(original)

        return obj

    # get the records and work out what shape they are
    # (makes the assumption that all records have the same spec, which /should/ be true)
    records = models.Record.list_by_upload(job.id)
    spec = objectify(records[0])

    # create a master spreadsheet with the right shape
    s = StringIO()
    sheet = sheets.MasterSheet(writer=s, spec=list(spec.keys()))

    # for each record, objectify it and add to the sheet
    for r in records:
        assert isinstance(r, models.Record)
        obj = objectify(r)
        sheet.add_object(obj)

    sheet.save()
    return s.getvalue()
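# The merge at the end of objectify is easy to misread, so here is the
# behaviour it aims for, sketched with plain dicts standing in for a Record
# and its source row (merge is a hypothetical helper, not project code):

from copy import deepcopy

def merge(produced, source):
    # fields the run produced win over the source copy, except journal_title,
    # which the source wins when it carries a non-empty value
    overwrite = list(produced.keys())
    jt = source.get("journal_title")
    if jt is not None and jt != "":
        overwrite.remove("journal_title")
    original = deepcopy(source)
    for k in overwrite:
        if k in original:
            del original[k]
    merged = dict(produced)
    merged.update(original)
    return merged

# e.g. merge({"doi": "10.1234/x", "journal_title": "Resolved Title"},
#            {"doi": "ignored", "journal_title": "Submitted Title", "university": "A"})
# == {"doi": "10.1234/x",                  # run output wins
#     "journal_title": "Submitted Title",  # non-empty source value wins
#     "university": "A"}                   # extra source fields pass through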
def test_02_write_subset(self):
    # set up a very simple subset sheet
    spec = ["university", "pmcid", "journal_title"]
    s = StringIO()
    ms = sheets.MasterSheet(writer=s, spec=spec)

    # add an object which conforms to the spec of the subset
    ms.add_object({
        "university" : "A",
        "pmcid" : "a",
        "journal_title" : "1"
    })

    # check that the record has been written
    size = 0
    for o in ms.objects():
        size += 1
        assert o.get("university") == "A"
        assert o.get("pmcid") == "a"
        assert o.get("journal_title") == "1"
        assert len(o.keys()) == 3
    assert size == 1

    # now add an object with insufficient data for all columns
    ms.add_object({
        "university" : "B",
        "journal_title" : "2"
    })

    # check that the new record has been written correctly (with suitable defaults)
    size = 0
    found = False
    for o in ms.objects():
        size += 1
        if o.get("university") == "B":
            found = True
            assert o.get("journal_title") == "2"
            assert o.get("pmcid") == ""
    assert size == 2
    assert found

    # now add an object with too much data for the spec
    ms.add_object({
        "university" : "C",
        "pmcid" : "c",
        "something_else" : "Gamma"
    })

    # check that the new record has been written correctly (with the default for the
    # missing column, and the unrecognised field dropped)
    size = 0
    found = False
    for o in ms.objects():
        size += 1
        if o.get("university") == "C":
            found = True
            assert o.get("journal_title") == ""
            assert o.get("pmcid") == "c"
            assert "something_else" not in o
    assert size == 3
    assert found
def test_06_blank_rows(self):
    # blank rows in the source sheet should be skipped, leaving only the real records
    ms = sheets.MasterSheet(path=BLANK_LINES)
    objects = [o for o in ms.objects()]
    assert len(objects) == 20