def addOrganSpecificInformation(self, biomarkers, statements, normalizer, catalog):
    '''Attach body system ("organ") sub-objects to the biomarkers they describe.

    Every subject in ``statements`` whose RDF type is the biomarker-organ data
    type, and whose biomarker predicate names an entry of ``biomarkers``, gets
    a new "Biomarker Body System" object created inside that biomarker.

    :param biomarkers: mapping indexed by the value of the biomarker predicate,
        yielding the biomarker object to populate.
    :param statements: mapping of subject URI to {predicate URI: [objects]}.
    :param normalizer: callable turning an organ name into an object ID.
    :param catalog: catalog callable used to look up body systems by title.
    '''
    for subjectURI, preds in statements.items():
        # A missing type or biomarker predicate, or a reference to a biomarker
        # that is not in ``biomarkers``, raises KeyError: skip such subjects.
        try:
            rdfType = preds[_typeURI][0]
            if rdfType != _bmOrganDataTypeURI:
                continue
            biomarker = biomarkers[preds[_biomarkerPredicateURI][0]]
        except KeyError:
            continue

        organName = unicode(preds[_organPredicateURI][0])
        matches = catalog(Title=organName, object_provides=IBodySystem.__identifier__)
        if len(matches) < 1:
            _logger.warn('Unknown organ %s for biomarker %s', organName, biomarker.title)
            continue

        # Create the per-organ container inside the biomarker and point it at
        # the first matching body system.
        newID = biomarker.invokeFactory('Biomarker Body System', normalizer(organName))
        bodySystem = biomarker[newID]
        bodySystem.setTitle(matches[0].Title)
        bodySystem.setBodySystem(matches[0].UID)
        updateObject(bodySystem, subjectURI, preds, catalog)
        self.addBiomarkerToOrganGroup(biomarker, organName, catalog)

        if _hasBiomarkerOrganStudyDatasPredicateURI in preds:
            studyBags = preds[_hasBiomarkerOrganStudyDatasPredicateURI]
            self.addStudiesToOrgan(bodySystem, studyBags, statements, normalizer, catalog)

        # TODO: make a separate Certification type so we don't rely on these fixed values.
        # Although we'll likely never have other certifications.
        for certURI in preds.get(_certificationPredicateURI, []):
            if certURI == _cliaCertificationURI:
                bodySystem.cliaCertification = True
            elif certURI == _fdaCeritificationURI:
                bodySystem.fdaCertification = True
        bodySystem.reindexObject()
def updateBiomarker(self, obj, uri, predicates, context, statements):
    '''Update a biomarker from its RDF predicates.

    Applies the generic attribute update, then (when the matching predicates
    are present) grants the Reader role to the listed access groups and links
    the biomarker to every protocol referenced by its study data.

    :param obj: the biomarker object to update.
    :param uri: subject URI of the biomarker.
    :param predicates: {predicate URI: [objects]} for this biomarker.
    :param context: object used to acquire the catalog and passed to ``updateObject``.
    :param statements: mapping of subject URI to predicates for the whole graph.
    '''
    updateObject(obj, uri, predicates, context)

    if _accessPredicateURI in predicates:
        groups = [unicode(g) for g in predicates[_accessPredicateURI]]
        obj.accessGroups = groups
        roleSettings = [{'type': 'group', 'roles': [u'Reader'], 'id': g} for g in groups]
        sharingView = getMultiAdapter((obj, TestRequest()), name=u'sharing')
        sharingView.update_role_settings(roleSettings)

    if _hasBiomarkerStudyDatasPredicateURI not in predicates:
        return

    catalog = getToolByName(context, 'portal_catalog')
    protocolUIDs = []
    piUIDs = []
    # Only the first study-data bag is consulted.
    bag = statements[predicates[_hasBiomarkerStudyDatasPredicateURI][0]]
    for predicateURI, members in bag.iteritems():
        if predicateURI == _typeURI:
            continue
        # Assume anything else is a list item pointing to BiomarkerStudyData objects
        studyDatas = [statements[member] for member in members]
        for studyData in studyDatas:
            # Right now, we use just the "referencesStudy" predicate, if it's present
            if _referencesStudyPredicateURI not in studyData:
                continue
            studyID = unicode(studyData[_referencesStudyPredicateURI][0])
            brains = catalog(identifier=studyID, object_provides=IProtocol.__identifier__)
            protocolUIDs.extend([brain.UID for brain in brains])
            piUIDs.extend([brain.piUID for brain in brains])
            protocols = [brain.getObject() for brain in brains]
            for protocol in protocols:
                self._addBiomarkerToProtocol(obj, protocol)
    obj.setProtocols(protocolUIDs)
    obj.setPiUIDs(piUIDs)
def updateBiomarker(self, obj, uri, predicates, context, statements):
    '''Bring a biomarker up to date with its RDF description.

    After the generic ``updateObject`` pass this handles two optional
    predicates: access groups (each group receives the Reader role through the
    sharing view) and study data (each referenced protocol is recorded on the
    biomarker and the biomarker is added to that protocol).

    :param obj: biomarker being updated.
    :param uri: subject URI describing ``obj``.
    :param predicates: {predicate URI: [objects]} for ``uri``.
    :param context: acquisition context for tool lookup.
    :param statements: all parsed statements, keyed by subject URI.
    '''
    updateObject(obj, uri, predicates, context)

    hasAccess = _accessPredicateURI in predicates
    if hasAccess:
        accessGroups = []
        for groupID in predicates[_accessPredicateURI]:
            accessGroups.append(unicode(groupID))
        obj.accessGroups = accessGroups
        sharing = getMultiAdapter((obj, TestRequest()), name=u'sharing')
        sharing.update_role_settings(
            [dict(type='group', roles=[u'Reader'], id=groupID) for groupID in accessGroups]
        )

    hasStudyData = _hasBiomarkerStudyDatasPredicateURI in predicates
    if hasStudyData:
        catalog = getToolByName(context, 'portal_catalog')
        collectedProtocolUIDs, collectedPiUIDs = [], []
        firstBagURI = predicates[_hasBiomarkerStudyDatasPredicateURI][0]
        for key, itemURIs in statements[firstBagURI].iteritems():
            if key != _typeURI:
                # Assume anything else is a list item pointing to BiomarkerStudyData objects
                for itemPredicates in [statements[itemURI] for itemURI in itemURIs]:
                    # Right now, we use just the "referencesStudy" predicate, if it's present
                    if _referencesStudyPredicateURI in itemPredicates:
                        found = catalog(
                            identifier=unicode(itemPredicates[_referencesStudyPredicateURI][0]),
                            object_provides=IProtocol.__identifier__
                        )
                        collectedProtocolUIDs.extend([b.UID for b in found])
                        collectedPiUIDs.extend([b.piUID for b in found])
                        for protocolObj in [b.getObject() for b in found]:
                            self._addBiomarkerToProtocol(obj, protocolObj)
        obj.setProtocols(collectedProtocolUIDs)
        obj.setPiUIDs(collectedPiUIDs)
def __call__(self):
    """Ingest datasets from this folder's RDF data source.

    We have to override this because ECAS datasets come in with unpredictable
    RDF types, so every subject in the parsed statements is treated as a
    dataset.  Existing datasets (matched by generated object ID) are updated in
    place; others are created.

    Returns the rendered template when ``self.render`` is true, otherwise the
    number of datasets processed.

    Raises ``RDFIngestException`` when the folder has no RDF data source URL.
    """
    context = aq_inner(self.context)
    catalog = getToolByName(context, "portal_catalog")
    wfTool = getToolByName(context, "portal_workflow")
    rdfDataSource, dsSumDataSource = context.rdfDataSource, context.dsSumDataSource
    if dsSumDataSource:
        context.dataSummary = self.getSummaryData(dsSumDataSource)
    if not rdfDataSource:
        raise RDFIngestException(_(u"This folder has no RDF data source URL."))
    normalizerFunction = queryUtility(IIDNormalizer).normalize
    graph = ConjunctiveGraph()
    graph.parse(URLInputSource(rdfDataSource))
    statements = self._parseRDF(graph)
    createdObjects = []
    for uri, predicates in statements.items():
        objectID = self._generateID(uri, predicates, normalizerFunction)
        title = self._generateTitle(uri, predicates)
        if objectID in context.objectIds():
            dataset = context[objectID]
        else:
            dataset = context[context.invokeFactory("Dataset", objectID)]
        dataset.setTitle(title)
        updateObject(dataset, uri, predicates, context)
        createdObjects.append(CreatedObject(dataset))
        if _collaborativeGroupURI in predicates:
            self.updateCollaborativeGroup(dataset, unicode(predicates[_collaborativeGroupURI][0]), catalog)
        if _protocolPredicateURI in predicates:
            # Make sure each referenced protocol lists this dataset exactly once.
            protocolIDs = [unicode(i) for i in predicates[_protocolPredicateURI]]
            for proto in [brain.getObject() for brain in catalog(identifier=protocolIDs)]:
                uid = dataset.UID()
                current = [i.UID() for i in proto.datasets]
                if uid not in current:
                    current.append(uid)
                    proto.setDatasets(current)
                    proto.datasetNames = proto._computeDatasetNames()
                    proto.reindexObject()
        if _accessPredicateURI in predicates:
            groupIDs = [unicode(i) for i in predicates[_accessPredicateURI]]
            dataset.accessGroups = groupIDs
            settings = [dict(type="group", roles=[u"Reader"], id=i) for i in groupIDs]
            sharing = getMultiAdapter((dataset, TestRequest()), name=u"sharing")
            sharing.update_role_settings(settings)
        if _organPredicateURI in predicates:
            # The body system name is the last path segment of the organ URI.
            bodySystemName = predicates[_organPredicateURI][0].rsplit("/", 1)[-1]
            organs = self.getBodySysteObj(catalog, bodySystemName)
            if len(organs) > 0:
                dataset.setBodySystem(organs[0])
        self.publishDataset(wfTool, dataset, predicates)
        dataset.reindexObject()
    self.objects = createdObjects
    # A conditional expression, not ``a and b or c``: the latter would fall
    # through to the count whenever the rendered template happened to be falsy.
    return self.template() if self.render else len(self.objects)
def _ingestPeople(self):
    '''Ingest people from the folder's people data source into their sites.

    For every person statement that names at least one known site, the person
    is (re)created inside each such site.  People whose employment status is
    "Former employee" are not created; any existing objects for them are
    removed from the site instead.

    :returns: dict mapping ``(site.identifier, person URI)`` to the created
        Person object.
    '''
    context = aq_inner(self.context)
    catalog = getToolByName(context, 'portal_catalog')
    normalizerFunction = queryUtility(IIDNormalizer).normalize
    graph = ConjunctiveGraph()
    graph.parse(URLInputSource(context.peopleDataSource))
    statements = self._parseRDF(graph)
    createdPeople = {}
    for uri, predicates in statements.items():
        persons = catalog(identifier=unicode(uri), object_provides=IPerson.__identifier__)
        existingIDs = [p.id for p in persons]
        employmentStatus = self._getNameComponent(_employmentActiveURI, predicates)
        if _siteURI not in predicates:
            # Person without a site, ignore him or her.
            continue
        siteURIs = [unicode(i) for i in predicates[_siteURI]]
        results = catalog(object_provides=ISite.__identifier__, identifier=siteURIs)
        if len(results) == 0:
            # Person with a site, but it's unknown, so again, ignore him or her.
            continue
        for site in [i.getObject() for i in results]:
            if employmentStatus == "Former employee":
                # Person who is not a current employee anymore:
                # make sure to delete them if they exist
                for pid in existingIDs:
                    if pid in site.objectIds():
                        site.manage_delObjects(pid)
                continue
            objectID = self._generatePersonID(predicates, normalizerFunction)
            # Always start from a fresh Person object.
            if objectID in site.objectIds():
                site.manage_delObjects(objectID)
            person = site[site.invokeFactory('Person', objectID)]
            updateObject(person, uri, predicates, context)
            person.siteName = site.title
            person.setDescription(u'Staff, %s, %s' % (safe_unicode(person.siteName), safe_unicode(person.phone)))
            person.memberType = site.memberType
            degrees = []
            for degreePredicateURI in [URIRef(_degreePredicateURIPrefix + unicode(i)) for i in range(1, 4)]:
                if degreePredicateURI in predicates:
                    # strip() returns a new string; keep its result so that
                    # blank degrees are dropped and padding is not stored.
                    degree = unicode(predicates[degreePredicateURI][0]).strip()
                    if degree:
                        degrees.append(degree)
            person.setDegrees(degrees)
            if _photoPredicateURI in predicates:
                url = predicates[_photoPredicateURI][0]
                contentType = mimetypes.guess_type(url)[0] or 'image/gif'
                try:
                    with contextlib.closing(urllib2.urlopen(url)) as con:
                        field = person.schema['image']
                        field.set(person, con.read(), content_type=contentType, mimetype=contentType)
                except urllib2.HTTPError:
                    # Photos are best-effort: a failed HTTP fetch leaves the
                    # person without an image rather than aborting the ingest.
                    pass
            person.reindexObject()
            createdPeople[(site.identifier, uri)] = person
    return createdPeople
def addStudiesToOrgan(self, biomarkerBodySystem, bags, statements, normalizer, catalog):
    '''Add protocol/study-specific information to a biomarker body system.

    :param biomarkerBodySystem: container that receives "Body System Study" objects.
    :param bags: RDF-style collections (keys into ``statements``) whose items
        are URIs of biomarker study data.
    :param statements: mapping of subject URI to {predicate URI: [objects]};
        it is read but not modified.
    :param normalizer: callable turning a study title into an ID fragment.
    :param catalog: catalog used to resolve the referenced studies.
    '''
    # The RDF may contain an empty <hasBiomarkerStudyDatas/>, which means that
    # there will be just an empty Literal '' in the bags list (which will be a
    # one item list). In that case, don't bother adding studies.
    if len(bags) == 1 and unicode(bags[0]) == u'':
        return

    # Gather all the URIs, skipping each bag's RDF type entry.  The bag is
    # filtered rather than edited so the shared statements stay intact and a
    # bag that is visited twice (or lacks a type) cannot raise KeyError.
    bmStudyDataURIs = []
    for bag in bags:
        members = [objs for predicateURI, objs in statements[bag].items() if predicateURI != _typeURI]
        bmStudyDataURIs.extend(flatten(members))

    for studyURI in bmStudyDataURIs:
        bmStudyDataPredicates = statements[studyURI]
        if _referencesStudyPredicateURI not in bmStudyDataPredicates:
            continue
        studies = self.findObjectsByIdentifiers(
            catalog, [unicode(i) for i in bmStudyDataPredicates[_referencesStudyPredicateURI]]
        )
        if len(studies) < 1:
            _logger.warning(
                'Study "%s" not found for biomarker body system "%r"',
                bmStudyDataPredicates[_referencesStudyPredicateURI][0],
                biomarkerBodySystem.identifier
            )
            continue
        study = studies[0]
        # Object ID: last path segment of the study identifier plus its normalized title.
        identifier = str(study.identifier.split('/')[-1]) + '-' + normalizer(study.title)
        if identifier not in biomarkerBodySystem.keys():
            bodySystemStudy = biomarkerBodySystem[
                biomarkerBodySystem.invokeFactory('Body System Study', identifier)
            ]
        else:
            bodySystemStudy = biomarkerBodySystem[identifier]
        updateObject(bodySystemStudy, studyURI, bmStudyDataPredicates, catalog)
        bodySystemStudy.title = study.title
        bodySystemStudy.description = study.description
        bodySystemStudy.setProtocol(study.UID())
        # Two containment levels up from the study is the object handed to the protocol.
        self._addBiomarkerToProtocol(aq_parent(aq_inner(aq_parent(aq_inner(bodySystemStudy)))), study)
        if _sensitivityDatasPredicateURI in bmStudyDataPredicates:
            statisticsBags = bmStudyDataPredicates[_sensitivityDatasPredicateURI]
            self.addStatistics(bodySystemStudy, statisticsBags, statements, normalizer, catalog)
        bodySystemStudy.reindexObject()
def addStatistics(self, bodySystemStudy, bags, statements, normalizer, catalog):
    '''Add study statistics to a body system study.

    The bags are RDF-style collections of URIRefs to statistics found in the
    statements.

    :param bodySystemStudy: container that receives "Study Statistics" objects.
    :param bags: keys into ``statements`` naming the collections to read.
    :param statements: mapping of subject URI to {predicate URI: [objects]};
        it is read but not modified.
    :param normalizer: unused here; kept for a signature parallel to the
        other ``add...`` methods.
    :param catalog: passed through to ``updateObject``.
    '''
    # Gather all the URIs, skipping each bag's RDF type entry.  Filtering
    # (instead of deleting the entry) leaves the shared statements untouched,
    # so visiting a bag twice or a bag without a type cannot raise KeyError.
    sensitivityURIs = []
    for bag in bags:
        members = [objs for predicateURI, objs in statements[bag].items() if predicateURI != _typeURI]
        sensitivityURIs.extend(flatten(members))

    # For each set of statistics...
    for sensitivityURI in sensitivityURIs:
        predicates = statements[sensitivityURI]
        # Statistics have no natural ID, so a fresh UUID is used.
        stats = bodySystemStudy[bodySystemStudy.invokeFactory('Study Statistics', uuid.uuid1())]
        updateObject(stats, sensitivityURI, predicates, catalog)
        stats.title = sensitivityURI
        stats.reindexObject()
def addStatistics(self, bodySystemStudy, bags, statements, normalizer, catalog):
    '''Add study statistics to a body system study.

    The bags are RDF-style collections of URIRefs to statistics found in the
    statements.  One "Study Statistics" object is created inside
    ``bodySystemStudy`` per collected URI and titled with that URI.

    ``statements`` is only read: the RDF type entry of each bag is skipped
    while collecting, not removed from the mapping.
    '''
    sensitivityURIs = []
    for bag in bags:
        bagPredicates = statements[bag]
        # Skip the type entry without deleting it; deleting would alter the
        # caller's statements and fail with KeyError on a second visit.
        itemLists = []
        for predicateURI, objs in bagPredicates.items():
            if predicateURI != _typeURI:
                itemLists.append(objs)
        sensitivityURIs.extend(flatten(itemLists))

    # For each set of statistics...
    for sensitivityURI in sensitivityURIs:
        predicates = statements[sensitivityURI]
        newID = bodySystemStudy.invokeFactory('Study Statistics', uuid.uuid1())
        stats = bodySystemStudy[newID]
        updateObject(stats, sensitivityURI, predicates, catalog)
        stats.title = sensitivityURI
        stats.reindexObject()
def addStudiesToOrgan(self, biomarkerBodySystem, bags, statements, normalizer, catalog):
    '''Add protocol/study-specific information to a biomarker body system.

    For each biomarker study data item found in ``bags`` that references a
    known study, a "Body System Study" object is created (or reused) inside
    ``biomarkerBodySystem``, updated from the RDF, linked to the study, and
    given any statistics it refers to.

    ``statements`` is only read: the RDF type entry of each bag is skipped
    while collecting, not removed from the mapping.
    '''
    # The RDF may contain an empty <hasBiomarkerStudyDatas/>, which means that
    # there will be just an empty Literal '' in the bags list (which will be a
    # one item list). In that case, don't bother adding studies.
    if len(bags) == 1 and unicode(bags[0]) == u'':
        return

    # Gather all the URIs
    bmStudyDataURIs = []
    for bag in bags:
        # Skip the type entry without deleting it; deleting would alter the
        # caller's statements and fail with KeyError on a second visit.
        itemLists = []
        for predicateURI, objs in statements[bag].items():
            if predicateURI != _typeURI:
                itemLists.append(objs)
        bmStudyDataURIs.extend(flatten(itemLists))

    for studyURI in bmStudyDataURIs:
        bmStudyDataPredicates = statements[studyURI]
        if _referencesStudyPredicateURI not in bmStudyDataPredicates:
            continue
        referenced = [unicode(i) for i in bmStudyDataPredicates[_referencesStudyPredicateURI]]
        studies = self.findObjectsByIdentifiers(catalog, referenced)
        if len(studies) < 1:
            _logger.warning('Study "%s" not found for biomarker body system "%r"',
                bmStudyDataPredicates[_referencesStudyPredicateURI][0], biomarkerBodySystem.identifier)
            continue
        # Only the first matching study is used.
        identifier = str(studies[0].identifier.split('/')[-1]) + '-' + normalizer(studies[0].title)
        if identifier in biomarkerBodySystem.keys():
            bodySystemStudy = biomarkerBodySystem[identifier]
        else:
            bodySystemStudy = biomarkerBodySystem[biomarkerBodySystem.invokeFactory('Body System Study', identifier)]
        updateObject(bodySystemStudy, studyURI, bmStudyDataPredicates, catalog)
        bodySystemStudy.title = studies[0].title
        bodySystemStudy.description = studies[0].description
        bodySystemStudy.setProtocol(studies[0].UID())
        # Two containment levels up from the study is the object handed to the protocol.
        owner = aq_parent(aq_inner(aq_parent(aq_inner(bodySystemStudy))))
        self._addBiomarkerToProtocol(owner, studies[0])
        if _sensitivityDatasPredicateURI in bmStudyDataPredicates:
            # A separate name keeps the ``bags`` parameter from being rebound mid-loop.
            sensitivityBags = bmStudyDataPredicates[_sensitivityDatasPredicateURI]
            self.addStatistics(bodySystemStudy, sensitivityBags, statements, normalizer, catalog)
        bodySystemStudy.reindexObject()
def addOrganSpecificInformation(self, biomarkers, statements, normalizer, catalog):
    '''Populate biomarkers with body system (aka "organ") details.

    Only statements typed as biomarker-organ data that refer to a biomarker
    found in ``biomarkers`` are used.  Each one yields a new "Biomarker Body
    System" object inside its biomarker, titled after and linked to the first
    body system the catalog returns for the organ name.
    '''
    for uri, predicates in statements.items():
        try:
            isOrganData = predicates[_typeURI][0] == _bmOrganDataTypeURI
            if not isOrganData:
                continue
            biomarker = biomarkers[predicates[_biomarkerPredicateURI][0]]
        except KeyError:
            # No type, no biomarker predicate, or an unknown biomarker.
            continue
        organName = unicode(predicates[_organPredicateURI][0])
        found = catalog(Title=organName, object_provides=IBodySystem.__identifier__)
        if len(found) < 1:
            _logger.warn('Unknown organ %s for biomarker %s', organName, biomarker.title)
            continue
        organObjID = normalizer(organName)
        createdID = biomarker.invokeFactory('Biomarker Body System', organObjID)
        organInfo = biomarker[createdID]
        organInfo.setTitle(found[0].Title)
        organInfo.setBodySystem(found[0].UID)
        updateObject(organInfo, uri, predicates, catalog)
        self.addBiomarkerToOrganGroup(biomarker, organName, catalog)
        if _hasBiomarkerOrganStudyDatasPredicateURI in predicates:
            self.addStudiesToOrgan(
                organInfo,
                predicates[_hasBiomarkerOrganStudyDatasPredicateURI],
                statements,
                normalizer,
                catalog
            )
        # TODO: make a separate Certification type so we don't rely on these fixed values.
        # Although we'll likely never have other certifications.
        certificationURIs = predicates.get(_certificationPredicateURI, [])
        for certification in certificationURIs:
            if certification == _cliaCertificationURI:
                organInfo.cliaCertification = True
            elif certification == _fdaCeritificationURI:
                organInfo.fdaCertification = True
        organInfo.reindexObject()
def createObjects(self, objectID, title, uri, predicates, statements, context):
    '''Create a Site in ``context`` and fill it in from its RDF predicates.

    The member type, when at least one value is given, is passed through
    ``_transformMemberType`` before being stored.

    :returns: one-element list holding the ``CreatedObject`` wrapper for the site.
    '''
    newID = context.invokeFactory('Site', objectID)
    site = context[newID]
    updateObject(site, uri, predicates, context)
    if _memberTypeURI in predicates:
        memberTypes = predicates[_memberTypeURI]
        if len(memberTypes) > 0:
            site.memberType = _transformMemberType(unicode(memberTypes[0]))
    return [CreatedObject(site)]
def createObjects(self, objectID, title, uri, predicates, statements, context):
    '''Create a Protocol in ``context`` and fill it in from its RDF predicates.

    :returns: one-element list holding the ``CreatedObject`` wrapper for the protocol.
    '''
    newID = context.invokeFactory('Protocol', objectID)
    protocol = context[newID]
    updateObject(protocol, uri, predicates, context)
    wrapped = CreatedObject(protocol)
    return [wrapped]
def __call__(self, rdfDataSource=None):
    '''Ingest sites (and then sponsors, people and investigators) and render a results page.

    :param rdfDataSource: URL of the RDF to ingest; defaults to the folder's
        own ``rdfDataSource``.
    :returns: whatever ``self.renderResults()`` returns.
    :raises RDFIngestException: when no RDF data source URL is available.

    After the ingest, a notice listing every site that has no member type is
    mailed (best-effort) if a MailHost tool can be found.
    '''
    context = aq_inner(self.context)
    catalog = getToolByName(context, 'portal_catalog')
    if rdfDataSource is None:
        rdfDataSource = context.rdfDataSource
    if not rdfDataSource:
        raise RDFIngestException(_(u'This folder has no RDF data source URL.'))
    normalizerFunction = queryUtility(IIDNormalizer).normalize
    graph = ConjunctiveGraph()
    graph.parse(URLInputSource(rdfDataSource))
    statements = self._parseRDF(graph)
    createdObjects = []
    handler = SiteHandler()
    for uri, predicates in statements.items():
        results = catalog(identifier=unicode(uri), object_provides=ISite.__identifier__)
        objectID = handler.generateID(uri, predicates, normalizerFunction)
        # Check if homepage URL has rdf datasource in it because of missing http prefix
        if _homePageURI in predicates:
            predicates[_homePageURI][0] = _transformHomePage(predicates[_homePageURI][0], rdfDataSource)
        inFolder = objectID in context.keys()
        if len(results) == 1 or inFolder:
            # Existing site. Update it
            if inFolder:
                s = context[objectID]
            else:
                s = results[0].getObject()
            updateObject(s, uri, predicates, context)
            if _memberTypeURI in predicates and len(predicates[_memberTypeURI]) > 0:
                s.memberType = _transformMemberType(unicode(predicates[_memberTypeURI][0]))
            # Reset my investigators
            s.setPrincipalInvestigator(None)
            s.setCoPrincipalInvestigators([])
            s.setCoInvestigators([])
            s.setInvestigators([])
            # FIXME:
            # s.manage_delObjects(s.objectIds())
            created = [CreatedObject(s)]
        else:
            if len(results) > 1:
                # More than one? Nuke 'em all.
                context.manage_delObjects([brain.id for brain in results])
            # New site. Create it.
            title = handler.generateTitle(uri, predicates)
            created = handler.createObjects(objectID, title, uri, predicates, statements, context)
        for obj in created:
            obj.reindex()
        createdObjects.extend(created)
    self.objects = createdObjects
    statements, createdSites = self._updateSponsors()
    self._updateSiteIDs(createdSites.values())
    folks = self._ingestPeople()
    warnings = self._updateInvestigators(statements, createdSites, folks)
    for site in createdSites.values():
        site.obj.reindexObject()
    # Set the PI's UID on all members so we can search for everyone who works for a given PI
    for createdPerson in folks.itervalues():
        createdPerson.piUID = createdPerson.aq_parent.piUID
    # CA-609: check for sites without any member type.
    # Pass None as the default: without one getToolByName raises
    # AttributeError for a missing tool and the None check below never fires.
    mailTool = getToolByName(aq_inner(self.context), 'MailHost', None)
    if mailTool is not None:
        unknowns = ['* %s (%s)' % (i.obj.title, i.obj.siteID) for i in createdSites.values() if not i.obj.memberType]
        if len(unknowns) > 0:
            urlTool = getToolByName(aq_inner(self.context), 'portal_url')
            portal, portalURL = urlTool.getPortalObject(), urlTool()
            # Fall back to _sa when the portal has no usable sender address.
            src = portal.getProperty('email_from_address', _sa).strip()
            if not src:
                src = _sa
            charset = portal.getProperty('email_charset', 'utf-8')
            message = _message % {
                'portalURL': portalURL,
                'numberOfSites': len(unknowns),
                'sitesList': '\n'.join(unknowns),
            }
            subject = _(u'Notice: the portal ingested some EDRN sites with NO member type')
            try:
                mailTool.send(message, mto=_na, mfrom=src, subject=subject, charset=charset)
            except (socket.error, smtplib.SMTPException):
                # The notice is best-effort; a mail failure must not fail the ingest.
                pass
    self._results = Results(self.objects, warnings)
    return self.renderResults()
def __call__(self, rdfDataSource=None):
    '''Ingest protocols from RDF and render a results page.

    :param rdfDataSource: URL of the RDF to ingest; defaults to the folder's
        own ``rdfDataSource``.
    :returns: whatever ``self.renderResults()`` returns.
    :raises RDFIngestException: when no RDF data source URL is available.

    The statements are walked twice: first to collect, per protocol ID, the
    site IDs named by site-specific subjects; then to create or update one
    protocol per remaining subject.
    '''
    context = aq_inner(self.context)
    _logger.info('Study Folder RDF ingest for folder at %s', '/'.join(context.getPhysicalPath()))
    catalog = getToolByName(context, 'portal_catalog')
    if rdfDataSource is None:
        rdfDataSource = context.rdfDataSource
    if not rdfDataSource:
        raise RDFIngestException(_(u'This folder has no RDF data source URL.'))
    normalizerFunction = queryUtility(IIDNormalizer).normalize

    parseStart = time.time()
    graph = ConjunctiveGraph()
    graph.parse(URLInputSource(rdfDataSource))
    statements = self._parseRDF(graph)
    parseSeconds = time.time() - parseStart
    _logger.info('Took %f seconds to read and parse %s', parseSeconds, rdfDataSource)

    createdObjects = []
    handler = StudyHandler()
    processStart = time.time()

    # First gather the protocol-to-involved investigator sites.  The last path
    # segment of a site-specific URI is split on the hyphen into a protocol ID
    # and a site ID.
    protocolToInvolvedSites = {}
    for uri, predicates in statements.items():
        if predicates[_typeURI][0] == _siteSpecificTypeURI:
            lastSegment = os.path.basename(urlparse.urlparse(unicode(uri)).path)
            protocolID, siteID = lastSegment.split(u'-')
            protocolToInvolvedSites.setdefault(protocolID, set()).add(siteID)

    # Now go through each protocol
    for uri, predicates in statements.items():
        if predicates[_typeURI][0] == _siteSpecificTypeURI:
            continue
        if unicode(uri) == u'http://edrn.nci.nih.gov/data/protocols/0':
            # Bad data from DMCC
            continue
        results = catalog(identifier=unicode(uri), object_provides=IProtocol.__identifier__)
        objectID = handler.generateID(uri, predicates, normalizerFunction)
        isProject = unicode(predicates.get(_projectFlagURI, ['Protocol'])[0]) == u'Project'
        alreadyInFolder = objectID in context.keys()
        if len(results) == 1 or alreadyInFolder:
            # Existing protocol. Update it.
            protocol = context[objectID] if alreadyInFolder else results[0].getObject()
            previousID = protocol.id
            updateObject(protocol, uri, predicates, context)
            protocol.project = isProject
            regeneratedID = handler.generateID(uri, predicates, normalizerFunction)
            if previousID != regeneratedID:
                # Need to update the object ID too
                protocol.setId(regeneratedID)
            # And set the involved investigator sites
            self.setInvolvedInvestigatorSites(catalog, protocol, protocolToInvolvedSites)
            created = [CreatedObject(protocol)]
        else:
            if len(results) > 1:
                # More than one match is unexpected; remove them all and start over.
                context.manage_delObjects([brain.id for brain in results])
            # New protocol. Create it.
            title = handler.generateTitle(uri, predicates)
            created = handler.createObjects(objectID, title, uri, predicates, statements, context)
            for wrapper in created:
                wrapper.obj.project = isProject
                self.setInvolvedInvestigatorSites(catalog, wrapper.obj, protocolToInvolvedSites)
        for wrapper in created:
            wrapper.reindex()
        createdObjects.extend(created)
    _logger.info('Took %f seconds to process %d statements', time.time() - processStart, len(statements))

    self.objects = createdObjects
    groupStart = time.time()
    self.updateCollaborativeGroups(createdObjects, catalog)
    _logger.info('Took %f seconds to update collaborative groups', time.time() - groupStart)
    self._results = Results(self.objects, warnings=[])
    return self.renderResults()
def createObjects(self, objectID, title, uri, predicates, statements, context):
    '''Create a Disease in ``context`` and fill it in from its RDF predicates.

    When the affected-body-systems predicate is present, its URIs are resolved
    to UIDs via ``getUIDsFromURIs`` and stored as the affected organs.

    :returns: one-element list holding the ``CreatedObject`` wrapper for the disease.
    '''
    newID = context.invokeFactory('Disease', objectID)
    newDisease = context[newID]
    updateObject(newDisease, uri, predicates)
    if _bodySystemsAffectedPredURI in predicates:
        organUIDs = getUIDsFromURIs(context, predicates[_bodySystemsAffectedPredURI])
        newDisease.setAffectedOrgans(organUIDs)
    return [CreatedObject(newDisease)]
def createObjects(self, objectID, title, uri, predicates, statements, context):
    '''Create a Body System in ``context`` and fill it in from its RDF predicates.

    :returns: one-element list holding the ``CreatedObject`` wrapper for the body system.
    '''
    newID = context.invokeFactory('Body System', objectID)
    organ = context[newID]
    updateObject(organ, uri, predicates)
    wrapped = CreatedObject(organ)
    return [wrapped]
def createObjects(self, objectID, title, uri, predicates, statements, context):
    '''Create a Knowledge Object in ``context`` and fill it in from its RDF predicates.

    :returns: one-element list holding the ``CreatedObject`` wrapper for the new object.
    '''
    newID = context.invokeFactory('Knowledge Object', objectID)
    knowledgeObject = context[newID]
    updateObject(knowledgeObject, uri, predicates)
    wrapped = CreatedObject(knowledgeObject)
    return [wrapped]