def main(hostname, catalog_number, credential):
    """Insert or update an RNASeq.Execution_Run record and print its RID.

    :param hostname: ERMrest server hostname
    :param catalog_number: catalog identifier on the server
    :param credential: deriva credential object for authentication

    Reads all record fields from the module-level ``args`` namespace.
    When ``args.update == "F"`` a new row is inserted; otherwise
    ``args.update`` is taken to be the RID of an existing row to update.
    """
    catalog = ErmrestCatalog('https', hostname, catalog_number, credential)
    pb = catalog.getPathBuilder()
    run_table = pb.RNASeq.Execution_Run

    # Shared payload for both the insert and update paths (previously the
    # whole dict was duplicated in each branch).
    run_data = {
        "Replicate": args.repRID,
        "Workflow": args.workflowRID,
        "Reference_Genome": args.referenceRID,
        "Input_Bag": args.inputBagRID,
        "Notes": args.notes,
        "Execution_Status": args.status,
        # the CLI delivers newlines escaped as literal "\n"; restore them
        "Execution_Status_Detail": args.statusDetail.replace('\\n', '\n')
    }

    if args.update == "F":
        entities = run_table.insert([run_data])
        rid = entities[0]["RID"]
    else:
        run_data["RID"] = args.update
        run_table.update([run_data])
        rid = args.update
    print(rid)
def main(hostname, catalog_number, credential):
    """Insert or update an RNASeq.Output_Bag record and print its RID.

    :param hostname: ERMrest server hostname
    :param catalog_number: catalog identifier on the server
    :param credential: deriva credential object for authentication

    Reads record fields from the module-level ``args`` namespace.
    When ``args.update == "F"`` a new row is inserted; otherwise
    ``args.update`` is taken to be the RID of an existing row to update.
    """
    catalog = ErmrestCatalog('https', hostname, catalog_number, credential)
    pb = catalog.getPathBuilder()
    outputBag_table = pb.RNASeq.Output_Bag
    if args.update == "F":
        outputBag_data = {
            "Execution_Run": args.executionRunRID,
            "File_Name": args.file,
            "File_URL": args.loc,
            "File_MD5": args.md5,
            "File_Bytes": args.bytes,
            # timestamp truncated to whole seconds, ISO-8601 form
            "File_Creation_Time": datetime.now().replace(microsecond=0).isoformat(),
            "Notes": args.notes,
            "Bag_Type": "mRNA_Replicate_Analysis"
        }
        entities = outputBag_table.insert([outputBag_data])
        rid = entities[0]["RID"]
    else:
        outputBag_data = {
            "RID": args.update,
            "Execution_Run": args.executionRunRID
        }
        # BUG FIX: this branch previously called insert(), which cannot
        # succeed for a payload carrying an existing RID; use update()
        # like the Execution_Run / mRNA_QC sibling scripts do.
        outputBag_table.update([outputBag_data])
        rid = args.update
    print(rid)
def add_file_to_cohort(file, description, cohort):
    """
    Upload a file into a data collection and add that file into the set of
    files associated with a cohort analysis.

    :param file: local path to the file that should be uploaded and associated with the cohort
    :param description: text used to describe the file that is being uploaded
    :param cohort: RID of the analysis cohort to which the file should be associated
    :return: None.
    """
    credential = get_credential(synapseserver)
    store = HatracStore('https', synapseserver, credentials=credential)
    catalog = ErmrestCatalog('https', synapseserver, 1, credentials=credential)

    builder = catalog.getPathBuilder()
    zebrafish = builder.Zebrafish
    synapse = builder.Synapse
    collection = synapse.tables['Collection']

    # Create the Collection row first with a placeholder URL so its RID can
    # be used to name the hatrac object.
    rows = collection.insert([{'Description': description, 'URL': 'dummy2'}])
    newfileRID = rows[0]['RID']
    print('inserted file into collection {}'.format(newfileRID))

    # Upload the bytes under a name derived from the new row's RID.
    objpath = '/hatrac/Data/Data_{0}_{1}'.format(newfileRID, os.path.basename(file))
    location = store.put_obj(objpath, file)

    # Patch the row with the real object URL and the metadata hatrac reports.
    head = store.head(objpath)
    rows[0]['URL'] = location
    rows[0]['Orig. Basename'] = os.path.basename(file)
    rows[0]['MD5'] = head.headers['content-md5']
    rows[0]['#Bytes'] = head.headers['Content-Length']
    rows = collection.update(rows)

    # Now link into cohort.
    link_table = zebrafish.tables['Cohort Analysis_Collection']
    newfileRID = link_table.insert([{'Cohort Analysis': cohort, 'Collection': newfileRID}])
    return
def add_file_to_replicant(dataset_rid, fmap, description=''):
    """Upload one tomography file to hatrac and register it in
    isa.xray_tomography_data.

    :param dataset_rid: RID of the dataset the file belongs to
    :param fmap: tuple of (experiment_rid, biosample_rid, replicate_rid, filename stem)
    :param description: free-text description stored with the file record
    :return: None.
    """
    credential = get_credential(pbcserver)
    store = HatracStore('https', pbcserver, credentials=credential)
    catalog = ErmrestCatalog('https', pbcserver, 1, credentials=credential)

    (experiment_rid, biosample_rid, replicate_rid, filename) = fmap
    # Local layout is <dirname>/<stem>.mrc where dirname strips the
    # trailing _<N>_pre_rec suffix from the stem.
    dirname = re.sub('_[0-9]+_pre_rec$', '', filename)
    filename = filename + '.mrc'
    path = '{}/{}'.format(dirname, filename)
    print('Uploading ', path)
    objpath = '/hatrac/commons/data/{}/{}/{}?parents=true'.format(
        dataset_rid, replicate_rid, os.path.basename(filename))
    print('to ', objpath)
    loc = store.put_obj(objpath, path)
    print(loc)

    # Pull authoritative file metadata back from hatrac.
    r = store.head(objpath)
    md5 = r.headers['content-md5']
    byte_count = r.headers['Content-Length']
    submit_time = r.headers['Date']

    file = {
        'dataset': dataset_rid,
        'anatomy': pancreas,
        'device': xray_tomography,
        'equipment_model': 'commons:600:',
        'description': description,
        'url': loc,
        'filename': os.path.basename(filename),
        'file_type': 'commons:601:',
        'byte_count': byte_count,
        'submitted_on': submit_time,
        'md5': md5,
        'replicate': replicate_rid
    }
    print(file)
    pb = catalog.getPathBuilder()
    isa = pb.isa
    tomography_data = isa.tables['xray_tomography_data']
    try:
        newrid = tomography_data.insert([file])
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit. Insert fails if the row already
        # exists, so fall back to update to keep the call idempotent.
        newrid = tomography_data.update([file])
    return
def main(hostname, catalog_number, credential):
    """Delete one row, identified by ``args.RID``, from an RNASeq table.

    :param hostname: ERMrest server hostname
    :param catalog_number: catalog identifier on the server
    :param credential: deriva credential object for authentication

    ``args.table`` selects the target table ("mRNA_QC" or "Processed_File").

    :raises ValueError: if ``args.table`` names an unsupported table.
    """
    catalog = ErmrestCatalog('https', hostname, catalog_number, credential)
    pb = catalog.getPathBuilder()
    if args.table == 'mRNA_QC':
        run_table = pb.RNASeq.mRNA_QC
    elif args.table == "Processed_File":
        run_table = pb.RNASeq.Processed_File
    else:
        # Previously an unknown table name fell through to a NameError on
        # run_table; fail with a clear diagnostic instead.
        raise ValueError("unsupported table for deletion: {}".format(args.table))
    path = run_table.filter(run_table.RID == args.RID)
    path.delete()
    rid = args.RID
    print(rid + " deleted")
def create_online_client(uri):
    '''
    Create a client to access the public CFDE Deriva Catalog
    URI in the form: ${protocol}://${hostname}/chaise/recordset/#${record_number}/
    '''
    import re
    from urllib.parse import urlparse
    from deriva.core import ErmrestCatalog, get_credential

    parsed = urlparse(uri)
    # The catalog number is the integer prefix of the URI fragment
    # (the part after '#').
    catalog_id = int(re.match(r'^(\d+)/', parsed.fragment).group(1))
    client = ErmrestCatalog(parsed.scheme, parsed.hostname, catalog_id,
                            get_credential(parsed.hostname))
    # Return the CFDE schema handle from the catalog's path builder.
    return client.getPathBuilder().schemas['CFDE']
def main(hostname, catalog_number, credential):
    """Insert or update an RNASeq.mRNA_QC record and print its RID.

    :param hostname: ERMrest server hostname
    :param catalog_number: catalog identifier on the server
    :param credential: deriva credential object for authentication

    Reads record fields from the module-level ``args`` namespace.
    ``args.update`` selects the mode: "F" inserts a full row, "E" inserts a
    stub row carrying only the linkage columns, and any other value is
    taken as the RID of an existing row to update.
    """
    catalog = ErmrestCatalog('https', hostname, catalog_number, credential)
    pb = catalog.getPathBuilder()
    run_table = pb.RNASeq.mRNA_QC

    # Full payload shared by the insert ("F") and update paths (previously
    # duplicated verbatim in both branches).
    run_data = {
        "Execution_Run": args.executionRunRID,
        "Replicate": args.repRID,
        "Paired_End": args.ends,
        "Strandedness": args.stranded,
        "Median_Read_Length": args.length,
        "Raw_Count": args.rawCount,
        "Final_Count": args.assignedCount,
        "Median_TIN": args.tin,
        "Notes": args.notes
    }

    if args.update == "F":
        entities = run_table.insert([run_data])
        rid = entities[0]["RID"]
    elif args.update == "E":
        # "E" registers only the linkage columns (error/stub record).
        stub_data = {
            "Execution_Run": args.executionRunRID,
            "Replicate": args.repRID
        }
        entities = run_table.insert([stub_data])
        rid = entities[0]["RID"]
    else:
        run_data["RID"] = args.update
        run_table.update([run_data])
        rid = args.update
    print(rid)
def main(hostname, catalog_number, credential):
    """Register a new RNASeq.Input_Bag record and print the resulting RID.

    :param hostname: ERMrest server hostname
    :param catalog_number: catalog identifier on the server
    :param credential: deriva credential object for authentication

    Reads the record fields from the module-level ``args`` namespace.
    """
    catalog = ErmrestCatalog('https', hostname, catalog_number, credential)
    builder = catalog.getPathBuilder()
    bag_table = builder.RNASeq.Input_Bag
    # Assemble the new row; creation time is stamped now, truncated to
    # whole seconds, in ISO-8601 form.
    new_row = {
        "File_Name": args.file,
        "File_URL": args.loc,
        "File_MD5": args.md5,
        "File_Bytes": args.bytes,
        "File_Creation_Time": datetime.now().replace(microsecond=0).isoformat(),
        "Notes": args.notes,
        "Bag_Type": "Replicate_Input_Seq"
    }
    created = bag_table.insert([new_row])
    print(created[0]["RID"])
def init_variables(catalog_num=1):
    """Populate ``__main__`` with catalog, model, and DataPath handles for
    interactive use against the PBC consortium server.

    :param catalog_num: catalog identifier on the server (default 1)
    """
    server = 'pbcconsortium.isrd.isi.edu'
    credential = get_credential(server)
    catalog = ErmrestCatalog('https', server, catalog_num, credentials=credential)
    model_root = catalog.getCatalogModel()

    __main__.catalog = catalog
    __main__.model_root = model_root

    # Handles to main tables for manipulating the model, published into
    # __main__ one per (schema, table) pair.
    model_tables = {
        'Experiment': ('Beta_Cell', 'Experiment'),
        'Specimen': ('Beta_Cell', 'Specimen'),
        'Biosample': ('Beta_Cell', 'Biosample'),
        'Dataset': ('Beta_Cell', 'Dataset'),
        'imaging_data': ('isa', 'imaging_data'),
        'model': ('viz', 'model'),
    }
    for attr, (schema_name, table_name) in model_tables.items():
        setattr(__main__, attr, model_root.table(schema_name, table_name))

    # Handles for managing table contents via the DataPath library.
    pb = catalog.getPathBuilder()
    isa = pb.isa
    viz = pb.viz
    vocab = pb.vocab
    Beta_Cell = pb.Beta_Cell
    __main__.pb = pb
    __main__.isa = isa
    __main__.vocab = vocab

    # DataPath table handles.
    __main__.Experiment_dp = Beta_Cell.Experiment
    __main__.Biosample_dp = Beta_Cell.Biosample
    __main__.dataset_dp = Beta_Cell.Dataset
    __main__.XRay_Tomography_dp = Beta_Cell.XRay_Tomography_Data
    __main__.Specimen_dp = Beta_Cell.Specimen
    __main__.model_dp = viz.model
# Bind to the catalog (number 1) on `server` and expose convenience handles
# for interactive model manipulation and data access.
credential = get_credential(server)
catalog = ErmrestCatalog('https', server, 1, credentials=credential)
model_root = catalog.getCatalogModel()

# Get references to main tables for manipulating the model.
experiment = model_root.table('isa', 'experiment')
biosample = model_root.table('isa', 'biosample')
dataset = model_root.table('isa', 'dataset')
protocol = model_root.table('isa', 'protocol')
replicate = model_root.table('isa', 'replicate')
imaging_data = model_root.table('isa', 'imaging_data')
model = model_root.table("viz", 'model')

# Get references to the main tables for managing their contents using DataPath library
pb = catalog.getPathBuilder()
# Get main schema
isa = pb.isa
viz = pb.viz

# Get tables.... (DataPath handles, suffixed _dp to distinguish them from
# the model-management handles above)
experiment_dp = isa.experiment
biosample_dp = isa.biosample
dataset_dp = isa.dataset
protocol_dp = isa.protocol
replicate_dp = isa.replicate
xray_tomography_dp = isa.xray_tomography_data
specimen_dp = isa.specimen
model_dp = viz.model
class Registry(object):
    """CFDE Registry binding.

    Wraps an ErmrestCatalog bound to the CFDE registry catalog and exposes
    idempotent registration/update operations for datapackage submissions,
    releases, and DCC/group lookups.
    """
    def __init__(self, scheme='https', servername='app.nih-cfde.org', catalog='registry', credentials=None, session_config=None):
        """Bind to specified registry.

        Note: this binding operates as an authenticated client
        identity and may expose different capabilities depending on
        the client's role within the organization.
        """
        if credentials is None:
            credentials = get_credential(servername)
        if not session_config:
            session_config = DEFAULT_SESSION_CONFIG.copy()
        # retry-all is safe here because registry operations are idempotent
        session_config["allow_retry_on_all_methods"] = True
        self._catalog = ErmrestCatalog(scheme, servername, catalog, credentials, session_config=session_config)
        self._builder = self._catalog.getPathBuilder()

    def validate_dcc_id(self, dcc_id, submitting_user):
        """Validate that user has submitter role with this DCC according to registry.

        :param dcc_id: The dcc.id key of the DCC in the registry.
        :param submitting_user: The WebauthnUser representation of the authenticated submission user.

        Raises UnknownDccId for invalid DCC identifiers.
        Raises Forbidden if submitting_user is not a submitter for the named DCC.
        """
        rows = self.get_dcc(dcc_id)
        if len(rows) < 1:
            raise exception.UnknownDccId(dcc_id)
        self.enforce_dcc_submission(dcc_id, submitting_user)

    def _get_entity(self, table_name, id=None):
        """Get one or all entity records from a registry table.

        :param table_name: The registry table to access.
        :param id: A key to retrieve one row (default None retrieves all)
        """
        path = self._builder.CFDE.tables[table_name].path
        if id is not None:
            path = path.filter(path.table_instances[table_name].column_definitions['id'] == id)
        return list(path.entities().fetch())

    def list_datapackages(self):
        """Get a list of all datapackage submissions in the registry
        """
        return self._get_entity('datapackage')

    def get_latest_approved_datapackages(self, need_dcc_appr=True, need_cfde_appr=True):
        """Get a map of latest datapackages approved for release for each DCC id."""
        path = self._builder.CFDE.tables['datapackage'].path
        status = path.datapackage.status
        # candidate rows: content ready or already pending release
        path = path.filter(
            (status == terms.cfde_registry_dp_status.content_ready)
            | (status == terms.cfde_registry_dp_status.release_pending))
        if need_dcc_appr:
            path = path.filter(path.datapackage.dcc_approval_status == terms.cfde_registry_decision.approved)
        if need_cfde_appr:
            path = path.filter(path.datapackage.cfde_approval_status == terms.cfde_registry_decision.approved)
        res = {}
        # sorted newest-first per DCC, so the first row seen per DCC wins
        for row in path.entities().sort(path.datapackage.submitting_dcc, path.datapackage.submission_time.desc):
            if row['submitting_dcc'] not in res:
                res[row['submitting_dcc']] = row
        return res

    def get_datapackage(self, id):
        """Get datapackage by submission id or raise exception.

        :param id: The datapackage.id key for the submission in the registry

        Raises DatapackageUnknown if record is not found.
        """
        rows = self._get_entity('datapackage', id)
        if len(rows) < 1:
            raise exception.DatapackageUnknown('Datapackage "%s" not found in registry.' % (id, ))
        return rows[0]

    def get_datapackage_table(self, datapackage, position):
        """Get datapackage by submission id or raise exception.

        :param datapackage: The datapackage.id key for the submission in the registry
        :param position: The 0-based index of the table in the datapackage's list of resources

        Raises IndexError if record is not found.
        """
        path = self._builder.CFDE.datapackage_table.path
        path = path.filter(path.datapackage_table.datapackage == datapackage)
        path = path.filter(path.datapackage_table.position == position)
        rows = list(path.entities().fetch())
        if len(rows) < 1:
            raise IndexError('Datapackage table ("%s", %d) not found in registry.' % (datapackage, position))
        return rows[0]

    def register_release(self, id, dcc_datapackages, description=None):
        """Idempotently register new release in registry, returning (release row, dcc_datapackages).

        :param id: The release.id for the new record
        :param dcc_datapackages: A dict mapping {dcc_id: datapackage, ...} for constituents
        :param description: A human-readable description of this release

        The constituents are a set of datapackage records (dicts) as
        returned by the get_datapackage() method. The dcc_id key MUST
        match the submitting_dcc of the record.

        For repeat calls on existing releases, the definition will be
        updated if the release is still in the planning state, but a
        StateError will be raised if it is no longer in planning state.
        """
        for dcc_id, dp in dcc_datapackages.items():
            if dcc_id != dp['submitting_dcc']:
                raise ValueError('Mismatch in dcc_datapackages DCC IDs %s != %s' % (dcc_id, dp['submitting_dcc']))
        try:
            rel, old_dcc_dps = self.get_release(id)
        except exception.ReleaseUnknown:
            # create new release record
            newrow = {
                'id': id,
                'status': terms.cfde_registry_rel_status.planning,
                'description': None if description is nochange else description,
            }
            # let the server supply defaults for all other columns
            defaults = [
                cname
                for cname in self._builder.CFDE.release.column_definitions.keys()
                if cname not in newrow
            ]
            logger.info('Registering new release %s' % (id, ))
            self._catalog.post('/entity/CFDE:release?defaults=%s' % (','.join(defaults), ), json=[newrow])
            rel, old_dcc_dps = self.get_release(id)
        if rel['status'] != terms.cfde_registry_rel_status.planning:
            raise exception.StateError('Idempotent registration disallowed on existing release %(id)s with status=%(status)s' % rel)
        # prepare for idempotent updates
        old_dp_ids = {dp['id'] for dp in old_dcc_dps.values()}
        dp_ids = {dp['id'] for dp in dcc_datapackages.values()}
        datapackages = {dp['id']: dp for dp in dcc_datapackages.values()}
        # idempotently revise description
        if rel['description'] != description:
            logger.info('Updating release %s description: %s' % (id, description, ))
            self.update_release(id, description=description)
        # find currently registered constituents
        path = self._builder.CFDE.dcc_release_datapackage.path
        path = path.filter(path.dcc_release_datapackage.release == id)
        old_dp_ids = {row['datapackage'] for row in path.entities().fetch()}
        # remove stale constituents
        for dp_id in old_dp_ids.difference(dp_ids):
            logger.info('Removing constituent datapackage %s from release %s' % (dp_id, id))
            self._catalog.delete('/entity/CFDE:dcc_release_datapackage/release=%s&datapackage=%s' % (urlquote(id), urlquote(dp_id), ))
        # add new constituents
        new_dp_ids = dp_ids.difference(old_dp_ids)
        if new_dp_ids:
            logger.info('Adding constituent datapackages %s to release %s' % (new_dp_ids, id))
            self._catalog.post('/entity/CFDE:dcc_release_datapackage', json=[
                {
                    'dcc': datapackages[dp_id]['submitting_dcc'],
                    'release': id,
                    'datapackage': dp_id,
                }
                for dp_id in new_dp_ids
            ])
        # return registry content
        return self.get_release(id)

    def get_release(self, id):
        """Get release by submission id or raise exception, returning (release_row, dcc_datapackages).

        :param id: The release.id key for the release definition in the registry

        Raises ReleaseUnknown if record is not found.
        """
        rows = self._get_entity('release', id)
        if len(rows) < 1:
            raise exception.ReleaseUnknown('Release "%s" not found in registry.' % (id, ))
        rel = rows[0]
        # join constituent datapackages for this release
        path = self._builder.CFDE.dcc_release_datapackage.path
        path = path.filter(path.dcc_release_datapackage.release == id)
        path = path.link(self._builder.CFDE.datapackage)
        return rel, {row['submitting_dcc']: row for row in path.entities().fetch()}

    def register_datapackage(self, id, dcc_id, submitting_user, archive_url):
        """Idempotently register new submission in registry.

        :param id: The datapackage.id for the new record
        :param dcc_id: The datapackage.submitting_dcc for the new record
        :param submitting_user: The datapackage.submitting_user for the new record
        :param archive_url: The datapackage.datapackage_url for the new record

        May raise non-CfdeError exceptions on operational errors.
        """
        try:
            return self.get_datapackage(id)
        except exception.DatapackageUnknown:
            pass
        # poke the submitting user into the registry's user-tracking table in case they don't exist
        # this acts as controlled domain table for submitting_user fkeys
        self._catalog.post('/entity/public:ERMrest_Client?onconflict=skip', json=[{
            'ID': submitting_user.webauthn_id,
            'Display_Name': submitting_user.display_name,
            'Full_Name': submitting_user.full_name,
            'Email': submitting_user.email,
            'Client_Object': {
                'id': submitting_user.webauthn_id,
                'display_name': submitting_user.display_name,
            }
        }])
        newrow = {
            "id": id,
            "submitting_dcc": dcc_id,
            "submitting_user": submitting_user.webauthn_id,
            "datapackage_url": archive_url,
            # we need to supply these unless catalog starts giving default values for us
            "submission_time": datetime.datetime.utcnow().isoformat(),
            "status": terms.cfde_registry_dp_status.submitted,
        }
        defaults = [
            cname
            for cname in self._builder.CFDE.datapackage.column_definitions.keys()
            if cname not in newrow
        ]
        self._catalog.post('/entity/CFDE:datapackage?defaults=%s' % (','.join(defaults), ), json=[newrow])
        # kind of redundant, but make sure we round-trip this w/ server-applied defaults?
        return self.get_datapackage(id)

    def register_datapackage_table(self, datapackage, position, table_name):
        """Idempotently register new datapackage table in registry.

        :param datapackage: The datapackage.id for the containing datapackage
        :param position: The integer position of this table in the datapackage's list of resources
        :param table_name: The "name" field of the tabular resource
        """
        newrow = {
            'datapackage': datapackage,
            'position': position,
            'table_name': table_name,
            'status': terms.cfde_registry_dpt_status.enumerated,
            'num_rows': None,
            'diagnostics': None,
        }
        rows = self._catalog.post('/entity/CFDE:datapackage_table?onconflict=skip', json=[newrow]).json()
        if len(rows) == 0:
            # row exists: onconflict=skip inserted nothing, so reset its status
            self.update_datapackage_table(datapackage, position, status=terms.cfde_registry_dpt_status.enumerated)

    def update_release(self, id, status=nochange, description=nochange, cfde_approval_status=nochange, release_time=nochange, ermrest_url=nochange, browse_url=nochange, summary_url=nochange, diagnostics=nochange):
        """Idempotently update release metadata in registry.

        :param id: The release.id of the existing record to update
        :param status: The new release.status value (default nochange)
        :param description: The new release.description value (default nochange)
        :param cfde_approval_status: The new release.cfde_approval_status value (default nochange)
        :param release_time: The new release.release_time value (default nochange)
        :param ermrest_url: The new release.review_ermrest_url value (default nochange)
        :param browse_url: The new release.review_browse_url value (default nochange)
        :param summary_url: The new release.review_summary_url value (default nochange)
        :param diagnostics: The new release.diagnostics value (default nochange)

        The special `nochange` singleton value used as default for
        optional arguments represents the desire to keep whatever
        current value exists for that field in the registry.

        May raise non-CfdeError exceptions on operational errors.
        """
        if not isinstance(id, str):
            raise TypeError('expected id of type str, not %s' % (type(id), ))
        existing, existing_dcc_dps = self.get_release(id)
        # only send fields that were supplied AND actually differ
        changes = {
            k: v
            for k, v in {
                'status': status,
                'description': description,
                'cfde_approval_status': cfde_approval_status,
                'release_time': release_time,
                'ermrest_url': ermrest_url,
                'browse_url': browse_url,
                'summary_url': summary_url,
                'diagnostics': diagnostics,
            }.items()
            if v is not nochange and v != existing[k]
        }
        if not changes:
            return
        changes['id'] = id
        self._catalog.put(
            '/attributegroup/CFDE:release/id;%s' % (','.join([c for c in changes.keys() if c != 'id']), ),
            json=[changes])

    def update_datapackage(self, id, status=nochange, diagnostics=nochange, review_ermrest_url=nochange, review_browse_url=nochange, review_summary_url=nochange):
        """Idempotently update datapackage metadata in registry.

        :param id: The datapackage.id of the existing record to update
        :param status: The new datapackage.status value (default nochange)
        :param diagnostics: The new datapackage.diagnostics value (default nochange)
        :param review_ermrest_url: The new datapackage.review_ermrest_url value (default nochange)
        :param review_browse_url: The new datapackage.review_browse_url value (default nochange)
        :param review_summary_url: The new datapackage.review_summary_url value (default nochange)

        The special `nochange` singleton value used as default for
        optional arguments represents the desire to keep whatever
        current value exists for that field in the registry.

        May raise non-CfdeError exceptions on operational errors.
        """
        if not isinstance(id, str):
            raise TypeError('expected id of type str, not %s' % (type(id), ))
        existing = self.get_datapackage(id)
        # only send fields that were supplied AND actually differ
        changes = {
            k: v
            for k, v in {
                'status': status,
                'diagnostics': diagnostics,
                'review_ermrest_url': review_ermrest_url,
                'review_browse_url': review_browse_url,
                'review_summary_url': review_summary_url,
            }.items()
            if v is not nochange and v != existing[k]
        }
        if not changes:
            return
        changes['id'] = id
        self._catalog.put(
            '/attributegroup/CFDE:datapackage/id;%s' % (','.join([c for c in changes.keys() if c != 'id']), ),
            json=[changes])

    def update_datapackage_table(self, datapackage, position, status=nochange, num_rows=nochange, diagnostics=nochange):
        """Idempotently update datapackage_table metadata in registry.

        :param datapackage: The datapackage_table.datapackage key value
        :param position: The datapackage_table.position key value
        :param status: The new datapackage_table.status value (default nochange)
        :param num_rows: The new datapackage_table.num_rows value (default nochange)
        :param diagnostics: The new datapackage_table.diagnostics value (default nochange)
        """
        if not isinstance(datapackage, str):
            raise TypeError('expected datapackage of type str, not %s' % (type(datapackage), ))
        if not isinstance(position, int):
            raise TypeError('expected id of type int, not %s' % (type(position), ))
        existing = self.get_datapackage_table(datapackage, position)
        # only send fields that were supplied AND actually differ
        changes = {
            k: v
            for k, v in {
                'status': status,
                'num_rows': num_rows,
                'diagnostics': diagnostics,
            }.items()
            if v is not nochange and v != existing[k]
        }
        if not changes:
            return
        changes.update({
            'datapackage': datapackage,
            'position': position,
        })
        self._catalog.put(
            '/attributegroup/CFDE:datapackage_table/datapackage,position;%s' % (','.join([
                c for c in changes.keys()
                if c not in {'datapackage', 'position'}
            ]), ),
            json=[changes])

    def get_dcc(self, dcc_id=None):
        """Get one or all DCC records from the registry.

        :param dcc_id: Optional dcc.id key string to limit results to single DCC (default None)

        Returns a list of dict-like records representing rows of the
        registry dcc table, optionally restricted to a specific dcc.id
        key.
        """
        return self._get_entity('dcc', dcc_id)

    def get_group(self, group_id=None):
        """Get one or all group records from the registry.

        :param group_id: Optional group.id key string to limit results to single group (default None)

        Returns a list of dict-like records representing rows of the
        registry group table, optionally restricted to a specific
        group.id key.
        """
        return self._get_entity('group', group_id)

    def get_group_role(self, role_id=None):
        """Get one or all group-role records from the registry.

        :param role_id: Optional group_role.id key string to limit results to single role (default None)

        Returns a list of dict-like records representing rows of the
        registry group_role table, optionally restricted to a specific
        group_role.id key.
        """
        return self._get_entity('group_role', role_id)

    def get_groups_by_dcc_role(self, role_id=None, dcc_id=None):
        """Get groups by DCC x role for one or all roles and DCCs.

        :param role_id: Optional role.id key string to limit results to a single group role (default None)
        :param dcc_id: Optional dcc.id key string to limit results to a single DCC (default None)

        Returns a list of dict-like records associating a DCC id, a
        role ID, and a list of group IDs suitable as an ACL for that
        particular dcc-role combination.
        """
        # find range of possible values
        dccs = {row['id']: row for row in self.get_dcc(dcc_id)}
        roles = {row['id']: row for row in self.get_group_role(role_id)}
        # find mapped groups (an inner join)
        path = self._builder.CFDE.dcc_group_role.path.link(self._builder.CFDE.group)
        if role_id is not None:
            path = path.filter(path.dcc_group_role.role == role_id)
        if dcc_id is not None:
            path = path.filter(path.dcc_group_role.dcc == dcc_id)
        dcc_roles = {
            (row['dcc'], row['role']): row
            for row in path.groupby(path.dcc_group_role.dcc, path.dcc_group_role.role) \
            .attributes(ArrayD(path.group).alias("groups")) \
            .fetch()
        }
        # as a convenience for simple consumers, emulate a full outer
        # join pattern to return empty lists for missing combinations
        return [
            (
                dcc_roles[(dcc_id, role_id)] \
                if (dcc_id, role_id) in dcc_roles \
                else {"dcc": dcc_id, "role": role_id, "groups": []}
            )
            for dcc_id in dccs
            for role_id in roles
        ]

    def get_dcc_acl(self, dcc_id, role_id):
        """Get groups for one DCC X group_role as a webauthn-style ACL.

        :param dcc_id: A dcc.id key known by the registry.
        :param role_id: A group_role.id key known by the registry.

        Returns a list of webauthn ID strings as an access control
        list suitable for intersection tests with
        WebauthnUser.acl_authz_test().
        """
        acl = set()
        for row in self.get_groups_by_dcc_role(role_id, dcc_id):
            acl.update({grp['webauthn_id'] for grp in row['groups']})
        return list(sorted(acl))

    def enforce_dcc_submission(self, dcc_id, submitting_user):
        """Verify that submitting_user is authorized to submit datapackages for dcc_id.

        :param dcc_id: The dcc.id key of the DCC in the registry
        :param submitting_user: The WebauthnUser representation of the user context.

        Raises Forbidden if user does not have submitter role for DCC.
        """
        submitting_user.acl_authz_test(
            self.get_dcc_acl(dcc_id, terms.cfde_registry_grp_role.submitter),
            'Submission to DCC %s is forbidden' % (dcc_id, ))

    @classmethod
    def dump_onboarding(self, registry_datapackage):
        """Dump onboarding info about DCCs in registry"""
        # restrict to the onboarding-related resources only
        resources = [
            resource
            for resource in registry_datapackage.package_def['resources']
            if resource['name'] in {'dcc', 'group', 'dcc_group_role'}
        ]
        registry_datapackage.dump_data_files(resources=resources)
class DashboardQueryHelper(object):
    """Helper for building dashboard statistics queries against a CFDE catalog."""

    def __init__(self, hostname, catalogid, scheme='https', caching=True):
        # retry-all is acceptable: these are read-only dashboard queries
        session_config = DEFAULT_SESSION_CONFIG.copy()
        session_config["allow_retry_on_all_methods"] = True
        self.catalog = ErmrestCatalog(scheme, hostname, catalogid, caching=caching, session_config=session_config)
        self.builder = self.catalog.getPathBuilder()

    def run_demo(self):
        """Run each example query and dump all results as JSON."""
        projects = {
            (row['id_namespace'], row['local_id']): row
            for row in self.list_projects(use_root_projects=True)
        }
        # NOTE(review): hard-coded demo key — assumes this root project exists
        # in the demo catalog content.
        rid_for_parent_proj = projects[('cfde_id_namespace:2', '3a51534abc6e1a5ee6d9cc86c4007b56')]['RID']

        # use list() to convert each ResultSet
        # for easier JSON serialization...
        results = {
            #'list_projects': list(self.list_projects()),
            #'list_root_projects': list(self.list_projects(use_root_projects=True)),
            #'list_datatypes': list(self.list_datatypes()),
            #'list_formats': list(self.list_formats()),

            'root_projects': list(self.list_projects(use_root_projects=True)),

            'subject_stats_assaytype_subproject': list(StatsQuery(self).entity('subject').dimension('assay_type').dimension('subproject', parent_project_RID=rid_for_parent_proj).fetch()),

            'file_stats_anatomy_assaytype': list(StatsQuery(self).entity('file').dimension('anatomy').dimension('assay_type').fetch()),
            'file_stats_anatomy_datatype': list(StatsQuery(self).entity('file').dimension('anatomy').dimension('data_type').fetch()),
            'file_stats_anatomy_species': list(StatsQuery(self).entity('file').dimension('anatomy').dimension('species').fetch()),
            'file_stats_anatomy_project': list(StatsQuery(self).entity('file').dimension('anatomy').dimension('project_root').fetch()),
            'file_stats_assaytype_datatype': list(StatsQuery(self).entity('file').dimension('assay_type').dimension('data_type').fetch()),
            'file_stats_assaytype_species': list(StatsQuery(self).entity('file').dimension('assay_type').dimension('species').fetch()),
            'file_stats_assaytype_project': list(StatsQuery(self).entity('file').dimension('assay_type').dimension('project_root').fetch()),
            'file_stats_datatype_species': list(StatsQuery(self).entity('file').dimension('data_type').dimension('species').fetch()),
            'file_stats_datatype_project': list(StatsQuery(self).entity('file').dimension('data_type').dimension('project_root').fetch()),

            'biosample_stats_anatomy_assaytype': list(StatsQuery(self).entity('biosample').dimension('anatomy').dimension('assay_type').fetch()),
            'biosample_stats_anatomy_datatype': list(StatsQuery(self).entity('biosample').dimension('anatomy').dimension('data_type').fetch()),
            'biosample_stats_anatomy_species': list(StatsQuery(self).entity('biosample').dimension('anatomy').dimension('species').fetch()),
            'biosample_stats_anatomy_project': list(StatsQuery(self).entity('biosample').dimension('anatomy').dimension('project_root').fetch()),
            'biosample_stats_assaytype_datatype': list(StatsQuery(self).entity('biosample').dimension('assay_type').dimension('data_type').fetch()),
            'biosample_stats_assaytype_species': list(StatsQuery(self).entity('biosample').dimension('assay_type').dimension('species').fetch()),
            'biosample_stats_assaytype_project': list(StatsQuery(self).entity('biosample').dimension('assay_type').dimension('project_root').fetch()),
            'biosample_stats_datatype_species': list(StatsQuery(self).entity('biosample').dimension('data_type').dimension('species').fetch()),
            'biosample_stats_datatype_project': list(StatsQuery(self).entity('biosample').dimension('data_type').dimension('project_root').fetch()),

            'subject_stats_anatomy_assaytype': list(StatsQuery(self).entity('subject').dimension('anatomy').dimension('assay_type').fetch()),
            'subject_stats_anatomy_datatype': list(StatsQuery(self).entity('subject').dimension('anatomy').dimension('data_type').fetch()),
            'subject_stats_anatomy_species': list(StatsQuery(self).entity('subject').dimension('anatomy').dimension('species').fetch()),
            'subject_stats_anatomy_project': list(StatsQuery(self).entity('subject').dimension('anatomy').dimension('project_root').fetch()),
            'subject_stats_assaytype_datatype': list(StatsQuery(self).entity('subject').dimension('assay_type').dimension('data_type').fetch()),
            'subject_stats_assaytype_species': list(StatsQuery(self).entity('subject').dimension('assay_type').dimension('species').fetch()),
            'subject_stats_assaytype_project': list(StatsQuery(self).entity('subject').dimension('assay_type').dimension('project_root').fetch()),
            'subject_stats_datatype_species': list(StatsQuery(self).entity('subject').dimension('data_type').dimension('species').fetch()),
            'subject_stats_datatype_project': list(StatsQuery(self).entity('subject').dimension('data_type').dimension('project_root').fetch()),
        }
        print(json.dumps(results, indent=2))

    def list_projects(self, use_root_projects=False, parent_project_RID=None, headers=DEFAULT_HEADERS):
        """Return list of projects AKA funded activities

        :param use_root_projects: Only consider root projects (default False)
        :param parent_project_RID: Only consider children of specified project (default None)
        """
        # "children" joined through project_in_project lets us count
        # sub-projects per project via the right-outer join below.
        children = self.builder.CFDE.project.alias("children")
        pip1 = self.builder.CFDE.project_in_project.alias('pip1')
        project = self.builder.CFDE.project
        path = children.path
        path = path.link(
            pip1,
            on=((path.children.id_namespace == pip1.child_project_id_namespace)
                & (path.children.local_id == pip1.child_project_local_id))
        ).link(
            project,
            on=((pip1.parent_project_id_namespace == project.id_namespace)
                & (pip1.parent_project_local_id == project.local_id)),
            # right-outer so projects with no children still appear
            join_type='right'
        )
        if use_root_projects:
            # restrict to projects flagged as roots
            path = path.link(self.builder.CFDE.project_root)
        elif parent_project_RID is not None:
            # restrict to direct children of the named parent project
            pip2 = self.builder.CFDE.project_in_project.alias('pip2')
            parent = self.builder.CFDE.project.alias("parent")
            path = path.link(
                pip2,
                on=((path.project.id_namespace == pip2.child_project_id_namespace)
                    & (path.project.local_id == pip2.child_project_local_id))
            ).link(
                parent,
                on=((path.pip2.parent_project_id_namespace == parent.id_namespace)
                    & (path.pip2.parent_project_local_id == parent.local_id))
            ).filter(path.parent.RID == parent_project_RID)
        return path.groupby(
            path.project.RID,
        ).attributes(
            path.project.id_namespace,
            path.project.local_id,
            # 'name' collides with the Table API, so use column_definitions lookup
            path.project.column_definitions['name'],
            path.project.abbreviation,
            path.project.description,
            CntD(path.children.RID).alias('num_subprojects')
        ).fetch(headers=headers)

    def list_datatypes(self, headers=DEFAULT_HEADERS):
        """Return list of data_type terms """
        return self.builder.CFDE.data_type.path.entities().fetch(headers=headers)

    def list_formats(self, headers=DEFAULT_HEADERS):
        """Return list of file format terms """
        return self.builder.CFDE.file_format.path.entities().fetch(headers=headers)
class DashboardQueryHelper(object):
    """Helper that runs dashboard statistics queries against a CFDE catalog.

    Wraps an ErmrestCatalog connection and exposes one method per
    canned dashboard query (program lists, per-program file statistics,
    time-binned statistics, and running sums thereof).
    """

    def __init__(self, hostname, catalogid, scheme='https'):
        """Bind to the ERMrest catalog and build a datapath builder.

        :param hostname: server hostname
        :param catalogid: ERMrest catalog identifier
        :param scheme: URL scheme (default 'https')
        """
        self.catalog = ErmrestCatalog(scheme, hostname, catalogid)
        self.builder = self.catalog.getPathBuilder()

    def run_demo(self):
        """Run each example query and dump all results as JSON."""
        # use list() to convert each ResultSet
        # for easier JSON serialization...
        results = {
            'list_programs': list(self.list_programs()),
            'list_infotypes': list(self.list_infotypes()),
            'list_formats': list(self.list_formats()),
            'list_program_file_stats': list(self.list_program_file_stats()),
            'list_program_sampletype_file_stats': list(self.list_program_sampletype_file_stats()),
            'list_program_file_stats_by_time_bin': list(self.list_program_file_stats_by_time_bin()),
            'running_sum_program_file_stats': list(self.running_sum_program_file_stats()),
        }
        print(json.dumps(results, indent=2))

    def list_programs(self):
        """Return list of common fund programs

        NOTE: in demo content, program 'name' is NOT unique, e.g. GTEx
        occurs twice due to overlap in imports! The 'id' column is unique.
        """
        # trivial case: just return entities of a single table
        return self.builder.CFDE.common_fund_program.path.entities().fetch()

    def list_infotypes(self):
        """Return list of information type terms
        """
        return self.builder.CFDE.information_type.path.entities().fetch()

    def list_formats(self):
        """Return list of file format terms
        """
        return self.builder.CFDE.file_format.path.entities().fetch()

    def list_program_file_stats(self, programid=None):
        """Return list of file statistics per program.

        :param programid: Optional program id to filter to a single program (default None)

        NOTE: this query will not return a row for a program with zero
        files...

        NOTE: only non-null File.length values are summed, so null may
        be returned if none of the files have specified a length...
        """
        # more complex case: build joined table path
        path = self.builder.CFDE.dataset.path
        if programid is not None:
            path.filter(path.dataset.data_source == programid)
        path.link(self.builder.CFDE.files_in_datasets)
        path.link(self.builder.CFDE.file)
        # and return grouped aggregate results
        results = path.groupby(
            path.dataset.data_source,
        ).attributes(
            Cnt(path.file).alias('file_cnt'),
            Sum(path.file.length).alias('byte_cnt'),
        )
        return results.fetch()

    def list_program_sampletype_file_stats(self, programid=None):
        """Return list of file statistics per (program, sample_type).

        Like list_program_file_stats, but also include biosample
        sample_type in the group key, for more detailed result
        cagegories.

        :param programid: Optional program id to filter to a single program (default None)
        """
        path = self.builder.CFDE.sample_type.path
        path.link(self.builder.CFDE.bio_sample)
        path.link(self.builder.CFDE.assayed_by)
        path.link(self.builder.CFDE.data_event)
        path.link(self.builder.CFDE.generated_by)
        # right-outer join so we can count files w/o this biosample/event linkage
        # BUGFIX: the linked instance is named 'generated_by' (the table
        # name), not 'GeneratedBy' -- matches the identical join in
        # list_program_file_stats_by_time_bin below.
        path.link(
            self.builder.CFDE.file,
            on=(path.generated_by.file_id == self.builder.CFDE.file.id),
            join_type='right')
        path.link(self.builder.CFDE.files_in_datasets)
        path.link(self.builder.CFDE.dataset)
        if programid is not None:
            path.filter(path.dataset.data_source == programid)
        results = path.groupby(
            # compound grouping key
            path.dataset.data_source,
            path.bio_sample.sample_type.alias('sample_type_id'),
        ).attributes(
            # 'name' is part of Table API so we cannot use attribute-based lookup...
            path.sample_type.column_definitions['name'].alias('sample_type_name'),
            Cnt(path.file).alias('file_cnt'),
            Sum(path.file.length).alias('byte_cnt'),
        )
        return results.fetch()

    def list_program_file_stats_by_time_bin(self, nbins=100, min_ts=None,
                                            max_ts=None):
        """Return list of file statistics per (data_source, ts_bin)

        :param nbins: The number of bins to divide the time range
        :param min_ts: The lower (closed) bound of the time range
        :param max_ts: The upper (open) bound of the time range

        If min_ts or max_ts are unspecified, preliminary queries are
        performed to determine the actual timestamp range found in the
        source data. These values are used to configure the binning
        distribution.

        Files generation times are found from DataEvent.event_ts where
        linked to File by the GeneratedBy association. Files without
        such linkage are considered to have null event times.

        Results are keyed by data_source and ts_bin group keys.

        NOTE: Results are sparse! Groups are only returned when at
        least one matching row is found. This means that some bins,
        described next, may be absent in a particular query result.

        Each group includes a ts_bin field which is a three-element
        list describing the time bin:

           [ bin_number, lower_bound, upper_bound ]

        The files within the selected range will be summarized in
        groups with bins:

           [ 1, min_ts, (max_ts - min_ts)/nbins ]
           ...
           [ nbins, max_ts - (max_ts - min_ts)/nbins, max_ts ]

        Files without known event_ts will be summarized in a row with
        a special null bin:

           [ null, null, null ]

        Other files will be summarized in rows with special bins:

           [ 0, null, min_ts ]
           [ nbins+1, max_ts, null ]

        i.e. for files with unknown event_ts, with event_ts below
        min_ts, or with event_ts above max_ts, respectively.
        """
        path = self.builder.CFDE.data_event.path
        path.link(self.builder.CFDE.generated_by)
        # right-outer join so we can count files w/o this dataevent linkage
        path.link(
            self.builder.CFDE.file,
            on=(path.generated_by.file_id == self.builder.CFDE.file.id),
            join_type='right')
        path.link(self.builder.CFDE.files_in_datasets)
        path.link(self.builder.CFDE.dataset)

        # build this list once so we can reuse it for grouping and sorting
        groupkey = [
            path.dataset.data_source,
            Bin(path.data_event.event_ts, nbins, min_ts, max_ts).alias('ts_bin'),
        ]

        results = path.groupby(*groupkey).attributes(
            Cnt(path.file.id).alias('file_cnt'),
            Sum(path.file.length).alias('byte_cnt'),
        ).sort(*groupkey)

        return results.fetch()

    def running_sum_program_file_stats(self, nbins=100, min_ts=None,
                                       max_ts=None):
        """Transform results of list_program_file_stats_by_time to produce running sums

        :param nbins: The number of bins to divide the time range
        :param min_ts: The lower (closed) bound of the time range
        :param max_ts: The upper (open) bound of the time range

        The underlying query counts files and sums bytecounts only
        within each time bin. I.e. it represents change rather than
        total data capacities at given times. This function
        accumulates values to show total capacity trends.
        """
        data_source = None
        file_cnt = None
        byte_cnt = None
        # because underlying query results are sorted, we can just iterate...
        for row in self.list_program_file_stats_by_time_bin(
                nbins, min_ts, max_ts):
            if data_source != row['data_source']:
                # reset state for next group
                data_source = row['data_source']
                file_cnt = 0
                byte_cnt = 0
            # null aggregates (e.g. bins with no measurable files) do not
            # advance the running totals
            if row['file_cnt'] is not None:
                file_cnt += row['file_cnt']
            if row['byte_cnt'] is not None:
                byte_cnt += row['byte_cnt']
            yield {
                'data_source': data_source,
                'ts_bin': row['ts_bin'],
                'file_cnt': file_cnt,
                'byte_cnt': byte_cnt
            }