def main(hostname, catalog_number, credential):
    """Insert or update an RNASeq Execution_Run record and print its RID.

    When args.update == "F" a new row is inserted; any other value of
    args.update is treated as the RID of an existing row to update.
    """
    catalog = ErmrestCatalog('https', hostname, catalog_number, credential)
    run_table = catalog.getPathBuilder().RNASeq.Execution_Run

    # Payload fields shared by both the insert and the update paths.
    run_data = {
        "Replicate": args.repRID,
        "Workflow": args.workflowRID,
        "Reference_Genome": args.referenceRID,
        "Input_Bag": args.inputBagRID,
        "Notes": args.notes,
        "Execution_Status": args.status,
        # Literal "\n" sequences from the command line become real newlines.
        "Execution_Status_Detail": args.statusDetail.replace('\\n', '\n')
    }

    if args.update == "F":
        entities = run_table.insert([run_data])
        rid = entities[0]["RID"]
    else:
        run_data["RID"] = args.update
        run_table.update([run_data])
        rid = args.update

    print(rid)
示例#2
0
def main(hostname, catalog_number, credential):
    """Insert or update an RNASeq Output_Bag record and print its RID.

    When args.update == "F" a new row is inserted; any other value of
    args.update is treated as the RID of an existing row to update.
    """
    catalog = ErmrestCatalog('https', hostname, catalog_number, credential)
    pb = catalog.getPathBuilder()
    outputBag_table = pb.RNASeq.Output_Bag

    if args.update == "F":
        outputBag_data = {
            "Execution_Run": args.executionRunRID,
            "File_Name": args.file,
            "File_URL": args.loc,
            "File_MD5": args.md5,
            "File_Bytes": args.bytes,
            "File_Creation_Time":
            datetime.now().replace(microsecond=0).isoformat(),
            "Notes": args.notes,
            "Bag_Type": "mRNA_Replicate_Analysis"
        }
        entities = outputBag_table.insert([outputBag_data])
        rid = entities[0]["RID"]

    else:
        outputBag_data = {
            "RID": args.update,
            "Execution_Run": args.executionRunRID
        }
        # BUG FIX: this branch previously called insert() with an existing
        # RID (which fails or duplicates) and read the RID off the insert
        # result.  Use update(), matching the sibling Execution_Run and
        # mRNA_QC scripts in this file.
        entities = outputBag_table.update([outputBag_data])
        rid = args.update

    print(rid)
def add_file_to_cohort(file, description, cohort):
    """
    Upload a file into a data collection and associate it with a cohort analysis.
    :param file: local path to the file that should be uploaded and associated with the cohort
    :param description: text used to describe the file that is being uploaded
    :param cohort: RID of the analysis cohort to which the file should be associated
    :return: None.
    """
    credential = get_credential(synapseserver)
    store = HatracStore('https', synapseserver, credentials=credential)
    catalog = ErmrestCatalog('https', synapseserver, 1, credentials=credential)

    pb = catalog.getPathBuilder()
    zebrafish = pb.Zebrafish
    synapse = pb.Synapse

    # Insert the collection row first (with a placeholder URL) so we have a
    # RID to build the hatrac object name from.
    collection = synapse.tables['Collection']
    rows = collection.insert([{'Description': description, 'URL': 'dummy2'}])
    record = rows[0]
    newfileRID = record['RID']
    print('inserted file into collection {}'.format(newfileRID))

    # Upload the bytes, then back-fill the real URL and file metadata.
    basename = os.path.basename(file)
    path = '/hatrac/Data/Data_{0}_{1}'.format(newfileRID, basename)
    record['URL'] = store.put_obj(path, file)
    record['Orig. Basename'] = basename

    head = store.head(path)
    record['MD5'] = head.headers['content-md5']
    record['#Bytes'] = head.headers['Content-Length']
    rows = collection.update(rows)

    # Now link into cohort.
    link_table = zebrafish.tables['Cohort Analysis_Collection']
    link_table.insert([{'Cohort Analysis': cohort, 'Collection': newfileRID}])
    return
def add_file_to_replicant(dataset_rid, fmap, description=''):
    """
    Upload an X-ray tomography file into hatrac and register it in the
    isa.xray_tomography_data table.

    :param dataset_rid: RID of the dataset that the file belongs to
    :param fmap: tuple of (experiment_rid, biosample_rid, replicate_rid,
        filename), where filename is the base name without the '.mrc'
        extension
    :param description: text used to describe the file that is being uploaded
    :return: None.
    """
    credential = get_credential(pbcserver)
    store = HatracStore('https', pbcserver, credentials=credential)
    catalog = ErmrestCatalog('https', pbcserver, 1, credentials=credential)

    (experiment_rid, biosample_rid, replicate_rid, filename) = fmap
    # The local directory name is the filename without its trailing
    # "_<n>_pre_rec" suffix.
    dirname = re.sub('_[0-9]+_pre_rec$', '', filename)
    filename = filename + '.mrc'
    path = '{}/{}'.format(dirname, filename)
    print('Uploading ', path)
    objpath = '/hatrac/commons/data/{}/{}/{}?parents=true'.format(
        dataset_rid, replicate_rid, os.path.basename(filename))
    print('to ', objpath)
    loc = store.put_obj(objpath, path)
    print(loc)
    r = store.head(objpath)
    md5 = r.headers['content-md5']
    byte_count = r.headers['Content-Length']
    submit_time = r.headers['Date']

    file = {
        'dataset': dataset_rid,
        'anatomy': pancreas,
        'device': xray_tomography,
        'equipment_model': 'commons:600:',
        'description': description,
        'url': loc,
        'filename': os.path.basename(filename),
        'file_type': 'commons:601:',
        'byte_count': byte_count,
        'submitted_on': submit_time,
        'md5': md5,
        'replicate': replicate_rid
    }
    print(file)

    pb = catalog.getPathBuilder()
    isa = pb.isa

    tomography_data = isa.tables['xray_tomography_data']
    # Best-effort upsert: insert, and fall back to update if the row
    # already exists.  BUG FIX: the previous bare `except:` also swallowed
    # KeyboardInterrupt/SystemExit; catch only real errors.
    try:
        newrid = tomography_data.insert([file])
    except Exception:
        newrid = tomography_data.update([file])
    return
示例#5
0
def main(hostname, catalog_number, credential):
    """Delete the row identified by args.RID from the RNASeq table named
    by args.table.

    Only the mRNA_QC and Processed_File tables are supported.
    """
    catalog = ErmrestCatalog('https', hostname, catalog_number, credential)
    pb = catalog.getPathBuilder()
    if args.table == 'mRNA_QC':
        run_table = pb.RNASeq.mRNA_QC
    elif args.table == "Processed_File":
        run_table = pb.RNASeq.Processed_File
    else:
        # BUG FIX: an unrecognized table name previously crashed later
        # with NameError (run_table unbound); fail with a clear error.
        raise ValueError("unsupported table: {}".format(args.table))

    path = run_table.filter(run_table.RID == args.RID)
    path.delete()
    rid = args.RID

    print(rid + " deleted")
示例#6
0
def create_online_client(uri):
    """Create a client to access the public CFDE Deriva Catalog.

    URI in the form: ${protocol}://${hostname}/chaise/recordset/#${record_number}/

    :param uri: Chaise recordset URI; the catalog number is taken from the
        leading integer of the URI fragment.
    :return: the DataPath schema object for the 'CFDE' schema.
    :raises ValueError: if the URI fragment does not carry a catalog number.
    """
    import re
    from urllib.parse import urlparse
    from deriva.core import ErmrestCatalog, get_credential
    uri_parsed = urlparse(uri)
    match = re.match(r'^(\d+)/', uri_parsed.fragment)
    if match is None:
        # BUG FIX: previously a malformed URI crashed with an opaque
        # AttributeError on the None match object.
        raise ValueError(
            'URI fragment does not start with a catalog number: %r' % (uri, ))
    catalog_number = int(match.group(1))
    credential = get_credential(uri_parsed.hostname)
    catalog = ErmrestCatalog(uri_parsed.scheme, uri_parsed.hostname,
                             catalog_number, credential)
    pb = catalog.getPathBuilder()
    CFDE = pb.schemas['CFDE']
    return CFDE
示例#7
0
def main(hostname, catalog_number, credential):
    """Insert or update an RNASeq mRNA_QC record and print its RID.

    args.update selects the mode: "F" inserts a fully populated row,
    "E" inserts a minimal placeholder row, and any other value is
    treated as the RID of an existing row to update.
    """
    catalog = ErmrestCatalog('https', hostname, catalog_number, credential)
    run_table = catalog.getPathBuilder().RNASeq.mRNA_QC

    # Full QC payload, shared by the "F" insert and the update paths.
    qc_fields = {
        "Execution_Run": args.executionRunRID,
        "Replicate": args.repRID,
        "Paired_End": args.ends,
        "Strandedness": args.stranded,
        "Median_Read_Length": args.length,
        "Raw_Count": args.rawCount,
        "Final_Count": args.assignedCount,
        "Median_TIN": args.tin,
        "Notes": args.notes
    }

    if args.update == "F":
        entities = run_table.insert([qc_fields])
        rid = entities[0]["RID"]
    elif args.update == "E":
        placeholder = {
            "Execution_Run": args.executionRunRID,
            "Replicate": args.repRID
        }
        entities = run_table.insert([placeholder])
        rid = entities[0]["RID"]
    else:
        qc_fields["RID"] = args.update
        run_table.update([qc_fields])
        rid = args.update

    print(rid)
示例#8
0
def main(hostname, catalog_number, credential):
    """Insert a Replicate_Input_Seq record into RNASeq.Input_Bag and
    print the RID of the new row.
    """
    catalog = ErmrestCatalog('https', hostname, catalog_number, credential)
    inputBag_table = catalog.getPathBuilder().RNASeq.Input_Bag

    # Creation time truncated to whole seconds, in ISO-8601 form.
    creation_time = datetime.now().replace(microsecond=0).isoformat()
    inputBag_data = {
        "File_Name": args.file,
        "File_URL": args.loc,
        "File_MD5": args.md5,
        "File_Bytes": args.bytes,
        "File_Creation_Time": creation_time,
        "Notes": args.notes,
        "Bag_Type": "Replicate_Input_Seq"
    }

    rid = inputBag_table.insert([inputBag_data])[0]["RID"]
    print(rid)
示例#9
0
def init_variables(catalog_num=1):
    """Populate __main__ with handles into the pbcconsortium catalog.

    Exposes both ERMrest model objects (for schema manipulation) and
    DataPath table objects (for data access) as attributes of __main__,
    for convenient interactive use.

    :param catalog_num: ERMrest catalog number to bind (default 1).
    """
    server = 'pbcconsortium.isrd.isi.edu'
    credential = get_credential(server)
    catalog = ErmrestCatalog('https',
                             server,
                             catalog_num,
                             credentials=credential)
    model_root = catalog.getCatalogModel()

    __main__.catalog = catalog
    __main__.model_root = model_root

    # Model-level table handles for manipulating the schema.
    for attr, schema, table in [
            ('Experiment', 'Beta_Cell', 'Experiment'),
            ('Specimen', 'Beta_Cell', 'Specimen'),
            ('Biosample', 'Beta_Cell', 'Biosample'),
            ('Dataset', 'Beta_Cell', 'Dataset'),
            ('imaging_data', 'isa', 'imaging_data'),
            ('model', 'viz', 'model')]:
        setattr(__main__, attr, model_root.table(schema, table))

    # DataPath handles for managing table contents.
    pb = catalog.getPathBuilder()
    # Main schemas.
    isa = pb.isa
    viz = pb.viz
    vocab = pb.vocab
    Beta_Cell = pb.Beta_Cell

    __main__.pb = pb
    __main__.isa = isa
    __main__.vocab = vocab

    # Table handles.
    __main__.Experiment_dp = Beta_Cell.Experiment
    __main__.Biosample_dp = Beta_Cell.Biosample
    __main__.dataset_dp = Beta_Cell.Dataset
    __main__.XRay_Tomography_dp = Beta_Cell.XRay_Tomography_Data
    __main__.Specimen_dp = Beta_Cell.Specimen
    __main__.model_dp = viz.model
# Script-level setup: bind to catalog 1 on `server` (assumed to be defined
# earlier in the file -- TODO confirm) and expose model/table handles as
# module globals for interactive use.
credential = get_credential(server)
catalog = ErmrestCatalog('https', server, 1, credentials=credential)
model_root = catalog.getCatalogModel()

# Get references to main tables for manipulating the model.
experiment = model_root.table('isa', 'experiment')
biosample = model_root.table('isa', 'biosample')
dataset = model_root.table('isa', 'dataset')
protocol = model_root.table('isa','protocol')
replicate = model_root.table('isa','replicate')
imaging_data = model_root.table('isa','imaging_data')
model = model_root.table("viz", 'model')


# Get references to the main tables for managing their contents using DataPath library
pb = catalog.getPathBuilder()
# Get main schema
isa = pb.isa
viz = pb.viz

# Get tables....  (the *_dp names are DataPath handles, distinct from the
# model-level handles above)
experiment_dp = isa.experiment
biosample_dp = isa.biosample
dataset_dp = isa.dataset
protocol_dp = isa.protocol
replicate_dp = isa.replicate
xray_tomography_dp = isa.xray_tomography_data
specimen_dp = isa.specimen
model_dp = viz.model

示例#11
0
class Registry(object):
    """CFDE Registry binding.

    """
    def __init__(self,
                 scheme='https',
                 servername='app.nih-cfde.org',
                 catalog='registry',
                 credentials=None,
                 session_config=None):
        """Bind to specified registry.

        Note: this binding operates as an authenticated client
        identity and may expose different capabilities depending on
        the client's role within the organization.
        """
        if credentials is None:
            credentials = get_credential(servername)
        if not session_config:
            session_config = DEFAULT_SESSION_CONFIG.copy()
        # Registry writes are idempotent, so retrying non-idempotent HTTP
        # methods is safe here.
        session_config["allow_retry_on_all_methods"] = True
        self._catalog = ErmrestCatalog(scheme,
                                       servername,
                                       catalog,
                                       credentials,
                                       session_config=session_config)
        self._builder = self._catalog.getPathBuilder()

    def validate_dcc_id(self, dcc_id, submitting_user):
        """Validate that user has submitter role with this DCC according to registry.

        :param dcc_id: The dcc.id key of the DCC in the registry.
        :param submitting_user: The WebauthnUser representation of the authenticated submission user.

        Raises UnknownDccId for invalid DCC identifiers.
        Raises Forbidden if submitting_user is not a submitter for the named DCC.
        """
        rows = self.get_dcc(dcc_id)
        if len(rows) < 1:
            raise exception.UnknownDccId(dcc_id)
        self.enforce_dcc_submission(dcc_id, submitting_user)

    def _get_entity(self, table_name, id=None):
        """Get one or all entity records from a registry table.

        :param table_name: The registry table to access.
        :param id: A key to retrieve one row (default None retrieves all)
        """
        path = self._builder.CFDE.tables[table_name].path
        if id is not None:
            path = path.filter(path.table_instances[table_name].
                               column_definitions['id'] == id)
        return list(path.entities().fetch())

    def list_datapackages(self):
        """Get a list of all datapackage submissions in the registry

        """
        return self._get_entity('datapackage')

    def get_latest_approved_datapackages(self,
                                         need_dcc_appr=True,
                                         need_cfde_appr=True):
        """Get a map of latest datapackages approved for release for each DCC id."""
        path = self._builder.CFDE.tables['datapackage'].path
        status = path.datapackage.status
        path = path.filter(
            (status == terms.cfde_registry_dp_status.content_ready)
            | (status == terms.cfde_registry_dp_status.release_pending))
        if need_dcc_appr:
            path = path.filter(path.datapackage.dcc_approval_status ==
                               terms.cfde_registry_decision.approved)
        if need_cfde_appr:
            path = path.filter(path.datapackage.cfde_approval_status ==
                               terms.cfde_registry_decision.approved)
        res = {}
        # Rows arrive sorted by DCC then newest-first submission time, so the
        # first row seen per DCC is its latest approved datapackage.
        for row in path.entities().sort(path.datapackage.submitting_dcc,
                                        path.datapackage.submission_time.desc):
            if row['submitting_dcc'] not in res:
                res[row['submitting_dcc']] = row
        return res

    def get_datapackage(self, id):
        """Get datapackage by submission id or raise exception.

        :param id: The datapackage.id key for the submission in the registry

        Raises DatapackageUnknown if record is not found.
        """
        rows = self._get_entity('datapackage', id)
        if len(rows) < 1:
            raise exception.DatapackageUnknown(
                'Datapackage "%s" not found in registry.' % (id, ))
        return rows[0]

    def get_datapackage_table(self, datapackage, position):
        """Get datapackage table by (datapackage, position) or raise exception.

        :param datapackage: The datapackage.id key for the submission in the registry
        :param position: The 0-based index of the table in the datapackage's list of resources

        Raises IndexError if record is not found.
        """
        path = self._builder.CFDE.datapackage_table.path
        path = path.filter(path.datapackage_table.datapackage == datapackage)
        path = path.filter(path.datapackage_table.position == position)
        rows = list(path.entities().fetch())
        if len(rows) < 1:
            raise IndexError(
                'Datapackage table ("%s", %d) not found in registry.' %
                (datapackage, position))
        return rows[0]

    def register_release(self, id, dcc_datapackages, description=None):
        """Idempotently register new release in registry, returning (release row, dcc_datapackages).

        :param id: The release.id for the new record
        :param dcc_datapackages: A dict mapping {dcc_id: datapackage, ...} for constituents
        :param description: A human-readable description of this release

        The constituents are a set of datapackage records (dicts) as
        returned by the get_datapackage() method. The dcc_id key MUST
        match the submitting_dcc of the record.

        For repeat calls on existing releases, the definition will be
        updated if the release is still in the planning state, but a
        StateError will be raised if it is no longer in planning state.

        """
        for dcc_id, dp in dcc_datapackages.items():
            if dcc_id != dp['submitting_dcc']:
                raise ValueError(
                    'Mismatch in dcc_datapackages DCC IDs %s != %s' %
                    (dcc_id, dp['submitting_dcc']))

        try:
            rel, old_dcc_dps = self.get_release(id)
        except exception.ReleaseUnknown:
            # create new release record
            newrow = {
                'id': id,
                'status': terms.cfde_registry_rel_status.planning,
                # guard against the nochange sentinel leaking into the row
                'description':
                None if description is nochange else description,
            }
            defaults = [
                cname for cname in
                self._builder.CFDE.release.column_definitions.keys()
                if cname not in newrow
            ]
            logger.info('Registering new release %s' % (id, ))
            self._catalog.post('/entity/CFDE:release?defaults=%s' %
                               (','.join(defaults), ),
                               json=[newrow])
            rel, old_dcc_dps = self.get_release(id)

        if rel['status'] != terms.cfde_registry_rel_status.planning:
            raise exception.StateError(
                'Idempotent registration disallowed on existing release %(id)s with status=%(status)s'
                % rel)

        # prepare for idempotent updates
        # (a dead pre-computation of old_dp_ids from old_dcc_dps was removed
        # here; the authoritative value is fetched from the registry below)
        dp_ids = {dp['id'] for dp in dcc_datapackages.values()}
        datapackages = {dp['id']: dp for dp in dcc_datapackages.values()}

        # idempotently revise description
        if rel['description'] != description:
            logger.info('Updating release %s description: %s' % (
                id,
                description,
            ))
            self.update_release(id, description=description)

        # find currently registered constituents
        path = self._builder.CFDE.dcc_release_datapackage.path
        path = path.filter(path.dcc_release_datapackage.release == id)
        old_dp_ids = {row['datapackage'] for row in path.entities().fetch()}

        # remove stale constituents
        for dp_id in old_dp_ids.difference(dp_ids):
            logger.info('Removing constituent datapackage %s from release %s' %
                        (dp_id, id))
            self._catalog.delete(
                '/entity/CFDE:dcc_release_datapackage/release=%s&datapackage=%s'
                % (
                    urlquote(id),
                    urlquote(dp_id),
                ))

        # add new constituents
        new_dp_ids = dp_ids.difference(old_dp_ids)
        if new_dp_ids:
            logger.info('Adding constituent datapackages %s to release %s' %
                        (new_dp_ids, id))
            self._catalog.post('/entity/CFDE:dcc_release_datapackage',
                               json=[{
                                   'dcc':
                                   datapackages[dp_id]['submitting_dcc'],
                                   'release':
                                   id,
                                   'datapackage':
                                   dp_id,
                               } for dp_id in new_dp_ids])

        # return registry content
        return self.get_release(id)

    def get_release(self, id):
        """Get release by submission id or raise exception, returning (release_row, dcc_datapackages).

        :param id: The release.id key for the release definition in the registry

        Raises ReleaseUnknown if record is not found.
        """
        rows = self._get_entity('release', id)
        if len(rows) < 1:
            raise exception.ReleaseUnknown(
                'Release "%s" not found in registry.' % (id, ))
        rel = rows[0]
        path = self._builder.CFDE.dcc_release_datapackage.path
        path = path.filter(path.dcc_release_datapackage.release == id)
        path = path.link(self._builder.CFDE.datapackage)
        return rel, {
            row['submitting_dcc']: row
            for row in path.entities().fetch()
        }

    def register_datapackage(self, id, dcc_id, submitting_user, archive_url):
        """Idempotently register new submission in registry.

        :param id: The datapackage.id for the new record
        :param dcc_id: The datapackage.submitting_dcc for the new record
        :param submitting_user: The datapackage.submitting_user for the new record
        :param archive_url: The datapackage.datapackage_url for the new record

        May raise non-CfdeError exceptions on operational errors.
        """
        try:
            return self.get_datapackage(id)
        except exception.DatapackageUnknown:
            pass

        # poke the submitting user into the registry's user-tracking table in case they don't exist
        # this acts as controlled domain table for submitting_user fkeys
        self._catalog.post('/entity/public:ERMrest_Client?onconflict=skip',
                           json=[{
                               'ID': submitting_user.webauthn_id,
                               'Display_Name': submitting_user.display_name,
                               'Full_Name': submitting_user.full_name,
                               'Email': submitting_user.email,
                               'Client_Object': {
                                   'id': submitting_user.webauthn_id,
                                   'display_name':
                                   submitting_user.display_name,
                               }
                           }])

        newrow = {
            "id": id,
            "submitting_dcc": dcc_id,
            "submitting_user": submitting_user.webauthn_id,
            "datapackage_url": archive_url,
            # we need to supply these unless catalog starts giving default values for us
            "submission_time": datetime.datetime.utcnow().isoformat(),
            "status": terms.cfde_registry_dp_status.submitted,
        }
        defaults = [
            cname for cname in
            self._builder.CFDE.datapackage.column_definitions.keys()
            if cname not in newrow
        ]
        self._catalog.post('/entity/CFDE:datapackage?defaults=%s' %
                           (','.join(defaults), ),
                           json=[newrow])
        # kind of redundant, but make sure we round-trip this w/ server-applied defaults?
        return self.get_datapackage(id)

    def register_datapackage_table(self, datapackage, position, table_name):
        """Idempotently register new datapackage table in registry.

        :param datapackage: The datapackage.id for the containing datapackage
        :param position: The integer position of this table in the datapackage's list of resources
        :param table_name: The "name" field of the tabular resource

        """
        newrow = {
            'datapackage': datapackage,
            'position': position,
            'table_name': table_name,
            'status': terms.cfde_registry_dpt_status.enumerated,
            'num_rows': None,
            'diagnostics': None,
        }

        rows = self._catalog.post(
            '/entity/CFDE:datapackage_table?onconflict=skip',
            json=[newrow]).json()

        if len(rows) == 0:
            # row already exists; reset its status to enumerated
            self.update_datapackage_table(
                datapackage,
                position,
                status=terms.cfde_registry_dpt_status.enumerated)

    def update_release(self,
                       id,
                       status=nochange,
                       description=nochange,
                       cfde_approval_status=nochange,
                       release_time=nochange,
                       ermrest_url=nochange,
                       browse_url=nochange,
                       summary_url=nochange,
                       diagnostics=nochange):
        """Idempotently update release metadata in registry.

        :param id: The release.id of the existing record to update
        :param status: The new release.status value (default nochange)
        :param description: The new release.description value (default nochange)
        :param cfde_approval_status: The new release.cfde_approval_status value (default nochange)
        :param release_time: The new release.release_time value (default nochange)
        :param ermrest_url: The new release.review_ermrest_url value (default nochange)
        :param browse_url: The new release.review_browse_url value (default nochange)
        :param summary_url: The new release.review_summary_url value (default nochange)
        :param diagnostics: The new release.diagnostics value (default nochange)

        The special `nochange` singleton value used as default for
        optional arguments represents the desire to keep whatever
        current value exists for that field in the registry.

        May raise non-CfdeError exceptions on operational errors.
        """
        if not isinstance(id, str):
            raise TypeError('expected id of type str, not %s' % (type(id), ))
        existing, existing_dcc_dps = self.get_release(id)
        # keep only fields that were supplied AND actually differ
        changes = {
            k: v
            for k, v in {
                'status': status,
                'description': description,
                'cfde_approval_status': cfde_approval_status,
                'release_time': release_time,
                'ermrest_url': ermrest_url,
                'browse_url': browse_url,
                'summary_url': summary_url,
                'diagnostics': diagnostics,
            }.items() if v is not nochange and v != existing[k]
        }
        if not changes:
            return
        changes['id'] = id
        self._catalog.put('/attributegroup/CFDE:release/id;%s' %
                          (','.join([c
                                     for c in changes.keys() if c != 'id']), ),
                          json=[changes])

    def update_datapackage(self,
                           id,
                           status=nochange,
                           diagnostics=nochange,
                           review_ermrest_url=nochange,
                           review_browse_url=nochange,
                           review_summary_url=nochange):
        """Idempotently update datapackage metadata in registry.

        :param id: The datapackage.id of the existing record to update
        :param status: The new datapackage.status value (default nochange)
        :param diagnostics: The new datapackage.diagnostics value (default nochange)
        :param review_ermrest_url: The new datapackage.review_ermrest_url value (default nochange)
        :param review_browse_url: The new datapackage.review_browse_url value (default nochange)
        :param review_summary_url: The new datapackage.review_summary_url value (default nochange)

        The special `nochange` singleton value used as default for
        optional arguments represents the desire to keep whatever
        current value exists for that field in the registry.

        May raise non-CfdeError exceptions on operational errors.
        """
        if not isinstance(id, str):
            raise TypeError('expected id of type str, not %s' % (type(id), ))
        existing = self.get_datapackage(id)
        # keep only fields that were supplied AND actually differ
        changes = {
            k: v
            for k, v in {
                'status': status,
                'diagnostics': diagnostics,
                'review_ermrest_url': review_ermrest_url,
                'review_browse_url': review_browse_url,
                'review_summary_url': review_summary_url,
            }.items() if v is not nochange and v != existing[k]
        }
        if not changes:
            return
        changes['id'] = id
        self._catalog.put('/attributegroup/CFDE:datapackage/id;%s' %
                          (','.join([c
                                     for c in changes.keys() if c != 'id']), ),
                          json=[changes])

    def update_datapackage_table(self,
                                 datapackage,
                                 position,
                                 status=nochange,
                                 num_rows=nochange,
                                 diagnostics=nochange):
        """Idempotently update datapackage_table metadata in registry.

        :param datapackage: The datapackage_table.datapackage key value
        :param position: The datapackage_table.position key value
        :param status: The new datapackage_table.status value (default nochange)
        :param num_rows: The new datapackage_table.num_rows value (default nochange)
        :param diagnostics: The new datapackage_table.diagnostics value (default nochange)

        """
        if not isinstance(datapackage, str):
            raise TypeError('expected datapackage of type str, not %s' %
                            (type(datapackage), ))
        if not isinstance(position, int):
            raise TypeError('expected id of type int, not %s' %
                            (type(position), ))
        existing = self.get_datapackage_table(datapackage, position)
        # keep only fields that were supplied AND actually differ
        changes = {
            k: v
            for k, v in {
                'status': status,
                'num_rows': num_rows,
                'diagnostics': diagnostics,
            }.items() if v is not nochange and v != existing[k]
        }
        if not changes:
            return
        changes.update({
            'datapackage': datapackage,
            'position': position,
        })
        self._catalog.put(
            '/attributegroup/CFDE:datapackage_table/datapackage,position;%s' %
            (','.join([
                c for c in changes.keys()
                if c not in {'datapackage', 'position'}
            ]), ),
            json=[changes])

    def get_dcc(self, dcc_id=None):
        """Get one or all DCC records from the registry.

        :param dcc_id: Optional dcc.id key string to limit results to single DCC (default None)

        Returns a list of dict-like records representing rows of the
        registry dcc table, optionally restricted to a specific dcc.id
        key.
        """
        return self._get_entity('dcc', dcc_id)

    def get_group(self, group_id=None):
        """Get one or all group records from the registry.

        :param group_id: Optional group.id key string to limit results to single group (default None)

        Returns a list of dict-like records representing rows of the
        registry group table, optionally restricted to a specific group.id
        key.
        """
        return self._get_entity('group', group_id)

    def get_group_role(self, role_id=None):
        """Get one or all group-role records from the registry.

        :param role_id: Optional group_role.id key string to limit results to single role (default None)

        Returns a list of dict-like records representing rows of the
        registry group_role table, optionally restricted to a specific
        group_role.id key.
        """
        return self._get_entity('group_role', role_id)

    def get_groups_by_dcc_role(self, role_id=None, dcc_id=None):
        """Get groups by DCC x role for one or all roles and DCCs.

        :param role_id: Optional role.id key string to limit results to a single group role (default None)
        :param dcc_id: Optional dcc.id key string to limit results to a single DCC (default None)

        Returns a list of dict-like records associating a DCC id, a
        role ID, and a list of group IDs suitable as an ACL for that
        particular dcc-role combination.
        """
        # find range of possible values
        dccs = {row['id']: row for row in self.get_dcc(dcc_id)}
        roles = {row['id']: row for row in self.get_group_role(role_id)}

        # find mapped groups (an inner join)
        path = self._builder.CFDE.dcc_group_role.path.link(
            self._builder.CFDE.group)
        if role_id is not None:
            path = path.filter(path.dcc_group_role.role == role_id)
        if dcc_id is not None:
            path = path.filter(path.dcc_group_role.dcc == dcc_id)
        dcc_roles = {
            (row['dcc'], row['role']): row
            for row in path.groupby(path.dcc_group_role.dcc, path.dcc_group_role.role) \
            .attributes(ArrayD(path.group).alias("groups")) \
            .fetch()
        }

        # as a convenience for simple consumers, emulate a full outer
        # join pattern to return empty lists for missing combinations
        return [
            (
                dcc_roles[(dcc_id, role_id)] \
                if (dcc_id, role_id) in dcc_roles \
                else {"dcc": dcc_id, "role": role_id, "groups": []}
            )
            for dcc_id in dccs
            for role_id in roles
        ]

    def get_dcc_acl(self, dcc_id, role_id):
        """Get groups for one DCC X group_role as a webauthn-style ACL.

        :param dcc_id: A dcc.id key known by the registry.
        :param role_id: A group_role.id key known by the registry.

        Returns a list of webauthn ID strings as an access control
        list suitable for intersection tests with
        WebauthnUser.acl_authz_test().
        """
        acl = set()
        for row in self.get_groups_by_dcc_role(role_id, dcc_id):
            acl.update({grp['webauthn_id'] for grp in row['groups']})
        return list(sorted(acl))

    def enforce_dcc_submission(self, dcc_id, submitting_user):
        """Verify that submitting_user is authorized to submit datapackages for dcc_id.

        :param dcc_id: The dcc.id key of the DCC in the registry
        :param submitting_user: The WebauthnUser representation of the user context.

        Raises Forbidden if user does not have submitter role for DCC.
        """
        submitting_user.acl_authz_test(
            self.get_dcc_acl(dcc_id, terms.cfde_registry_grp_role.submitter),
            'Submission to DCC %s is forbidden' % (dcc_id, ))

    @classmethod
    def dump_onboarding(cls, registry_datapackage):
        """Dump onboarding info about DCCs in registry"""
        resources = [
            resource
            for resource in registry_datapackage.package_def['resources']
            if resource['name'] in {'dcc', 'group', 'dcc_group_role'}
        ]
        registry_datapackage.dump_data_files(resources=resources)
示例#12
0
class DashboardQueryHelper(object):
    """Convenience wrapper for dashboard summary queries on a CFDE catalog.

    Binds to one ERMrest catalog and exposes canned queries over the
    CFDE schema: project listings plus the StatsQuery summaries
    demonstrated in run_demo().
    """
    def __init__(self, hostname, catalogid, scheme='https', caching=True):
        """Connect to the catalog and prepare a datapath builder.

        :param hostname: ERMrest server hostname.
        :param catalogid: Catalog identifier on that server.
        :param scheme: URL scheme for the connection (default 'https').
        :param caching: Passed through to ErmrestCatalog (default True).
        """
        session_config = DEFAULT_SESSION_CONFIG.copy()
        # retry all HTTP methods (not only idempotent ones) on failure
        session_config["allow_retry_on_all_methods"] = True
        self.catalog = ErmrestCatalog(scheme,
                                      hostname,
                                      catalogid,
                                      caching=caching,
                                      session_config=session_config)
        self.builder = self.catalog.getPathBuilder()

    def run_demo(self):
        """Run each example query and dump all results as JSON."""
        # map (id_namespace, local_id) -> root project row
        projects = {(row['id_namespace'], row['local_id']): row
                    for row in self.list_projects(use_root_projects=True)}

        # NOTE(review): hard-coded demo project key; this raises
        # KeyError if that project is absent from the catalog queried.
        rid_for_parent_proj = projects[(
            'cfde_id_namespace:2', '3a51534abc6e1a5ee6d9cc86c4007b56')]['RID']

        # use list() to convert each ResultSet
        # for easier JSON serialization...
        results = {
            #'list_projects': list(self.list_projects()),
            #'list_root_projects': list(self.list_projects(use_root_projects=True)),
            #'list_datatypes': list(self.list_datatypes()),
            #'list_formats': list(self.list_formats()),
            'root_projects':
            list(self.list_projects(use_root_projects=True)),
            'subject_stats_assaytype_subproject':
            list(
                StatsQuery(self).entity('subject').dimension(
                    'assay_type').dimension(
                        'subproject',
                        parent_project_RID=rid_for_parent_proj).fetch()),
            'file_stats_anatomy_assaytype':
            list(
                StatsQuery(self).entity('file').dimension('anatomy').dimension(
                    'assay_type').fetch()),
            'file_stats_anatomy_datatype':
            list(
                StatsQuery(self).entity('file').dimension('anatomy').dimension(
                    'data_type').fetch()),
            'file_stats_anatomy_species':
            list(
                StatsQuery(self).entity('file').dimension('anatomy').dimension(
                    'species').fetch()),
            'file_stats_anatomy_project':
            list(
                StatsQuery(self).entity('file').dimension('anatomy').dimension(
                    'project_root').fetch()),
            'file_stats_assaytype_datatype':
            list(
                StatsQuery(self).entity('file').dimension(
                    'assay_type').dimension('data_type').fetch()),
            'file_stats_assaytype_species':
            list(
                StatsQuery(self).entity('file').dimension(
                    'assay_type').dimension('species').fetch()),
            'file_stats_assaytype_project':
            list(
                StatsQuery(self).entity('file').dimension(
                    'assay_type').dimension('project_root').fetch()),
            'file_stats_datatype_species':
            list(
                StatsQuery(self).entity('file').dimension(
                    'data_type').dimension('species').fetch()),
            'file_stats_datatype_project':
            list(
                StatsQuery(self).entity('file').dimension(
                    'data_type').dimension('project_root').fetch()),
            'biosample_stats_anatomy_assaytype':
            list(
                StatsQuery(self).entity('biosample').dimension(
                    'anatomy').dimension('assay_type').fetch()),
            'biosample_stats_anatomy_datatype':
            list(
                StatsQuery(self).entity('biosample').dimension(
                    'anatomy').dimension('data_type').fetch()),
            'biosample_stats_anatomy_species':
            list(
                StatsQuery(self).entity('biosample').dimension(
                    'anatomy').dimension('species').fetch()),
            'biosample_stats_anatomy_project':
            list(
                StatsQuery(self).entity('biosample').dimension(
                    'anatomy').dimension('project_root').fetch()),
            'biosample_stats_assaytype_datatype':
            list(
                StatsQuery(self).entity('biosample').dimension(
                    'assay_type').dimension('data_type').fetch()),
            'biosample_stats_assaytype_species':
            list(
                StatsQuery(self).entity('biosample').dimension(
                    'assay_type').dimension('species').fetch()),
            'biosample_stats_assaytype_project':
            list(
                StatsQuery(self).entity('biosample').dimension(
                    'assay_type').dimension('project_root').fetch()),
            'biosample_stats_datatype_species':
            list(
                StatsQuery(self).entity('biosample').dimension(
                    'data_type').dimension('species').fetch()),
            'biosample_stats_datatype_project':
            list(
                StatsQuery(self).entity('biosample').dimension(
                    'data_type').dimension('project_root').fetch()),
            'subject_stats_anatomy_assaytype':
            list(
                StatsQuery(self).entity('subject').dimension(
                    'anatomy').dimension('assay_type').fetch()),
            'subject_stats_anatomy_datatype':
            list(
                StatsQuery(self).entity('subject').dimension(
                    'anatomy').dimension('data_type').fetch()),
            'subject_stats_anatomy_species':
            list(
                StatsQuery(self).entity('subject').dimension(
                    'anatomy').dimension('species').fetch()),
            'subject_stats_anatomy_project':
            list(
                StatsQuery(self).entity('subject').dimension(
                    'anatomy').dimension('project_root').fetch()),
            'subject_stats_assaytype_datatype':
            list(
                StatsQuery(self).entity('subject').dimension(
                    'assay_type').dimension('data_type').fetch()),
            'subject_stats_assaytype_species':
            list(
                StatsQuery(self).entity('subject').dimension(
                    'assay_type').dimension('species').fetch()),
            'subject_stats_assaytype_project':
            list(
                StatsQuery(self).entity('subject').dimension(
                    'assay_type').dimension('project_root').fetch()),
            'subject_stats_datatype_species':
            list(
                StatsQuery(self).entity('subject').dimension(
                    'data_type').dimension('species').fetch()),
            'subject_stats_datatype_project':
            list(
                StatsQuery(self).entity('subject').dimension(
                    'data_type').dimension('project_root').fetch()),
        }
        print(json.dumps(results, indent=2))

    def list_projects(self,
                      use_root_projects=False,
                      parent_project_RID=None,
                      headers=DEFAULT_HEADERS):
        """Return list of projects AKA funded activities

        :param use_root_projects: Only consider root projects (default False)
        :param parent_project_RID: Only consider children of specified project (default None)

        Each result row carries the project's identifying columns plus
        a num_subprojects count of distinct child-project RIDs.
        """
        # "children" aliases project rows acting as sub-projects;
        # pip1 is the project_in_project linkage table.
        children = self.builder.CFDE.project.alias("children")
        pip1 = self.builder.CFDE.project_in_project.alias('pip1')
        project = self.builder.CFDE.project
        path = children.path
        # link children -> pip1, then right-outer join onto project so
        # every project appears even when it has no children to count
        path = path.link(
            pip1,
            on=((path.children.id_namespace == pip1.child_project_id_namespace)
                &
                (path.children.local_id == pip1.child_project_local_id))).link(
                    project,
                    on=((pip1.parent_project_id_namespace
                         == project.id_namespace)
                        & (pip1.parent_project_local_id == project.local_id)),
                    join_type='right')

        if use_root_projects:
            # restrict to projects recorded in the project_root table
            path = path.link(self.builder.CFDE.project_root)
        elif parent_project_RID is not None:
            # restrict to children of the designated parent project,
            # via a second project_in_project traversal (pip2)
            pip2 = self.builder.CFDE.project_in_project.alias('pip2')
            parent = self.builder.CFDE.project.alias("parent")
            path = path.link(
                pip2,
                on=((path.project.id_namespace
                     == pip2.child_project_id_namespace)
                    & (path.project.local_id
                       == pip2.child_project_local_id))).link(
                           parent,
                           on=((path.pip2.parent_project_id_namespace
                                == parent.id_namespace)
                               & (path.pip2.parent_project_local_id
                                  == parent.local_id))).filter(
                                      path.parent.RID == parent_project_RID)

        # group per project; CntD counts distinct children RIDs
        return path.groupby(path.project.RID, ).attributes(
            path.project.id_namespace, path.project.local_id,
            path.project.column_definitions['name'], path.project.abbreviation,
            path.project.description,
            CntD(path.children.RID).alias('num_subprojects')).fetch(
                headers=headers)

    def list_datatypes(self, headers=DEFAULT_HEADERS):
        """Return list of data_type terms
        """
        return self.builder.CFDE.data_type.path.entities().fetch(
            headers=headers)

    def list_formats(self, headers=DEFAULT_HEADERS):
        """Return list of file format terms
        """
        return self.builder.CFDE.file_format.path.entities().fetch(
            headers=headers)
# 示例#13
# 0
class DashboardQueryHelper(object):
    def __init__(self, hostname, catalogid, scheme='https'):
        """Connect to the catalog and prepare a datapath builder.

        :param hostname: ERMrest server hostname.
        :param catalogid: Catalog identifier on that server.
        :param scheme: URL scheme for the connection (default 'https').
        """
        self.catalog = ErmrestCatalog(scheme, hostname, catalogid)
        self.builder = self.catalog.getPathBuilder()

    def run_demo(self):
        """Run each example query and dump all results as JSON."""
        # Materialize each ResultSet with list() so json.dumps can
        # serialize it; dict insertion order preserves the demo order.
        demo_queries = (
            ('list_programs', self.list_programs),
            ('list_infotypes', self.list_infotypes),
            ('list_formats', self.list_formats),
            ('list_program_file_stats', self.list_program_file_stats),
            ('list_program_sampletype_file_stats',
             self.list_program_sampletype_file_stats),
            ('list_program_file_stats_by_time_bin',
             self.list_program_file_stats_by_time_bin),
            ('running_sum_program_file_stats',
             self.running_sum_program_file_stats),
        )
        results = {name: list(query()) for name, query in demo_queries}
        print(json.dumps(results, indent=2))

    def list_programs(self):
        """Return the list of common fund programs.

        NOTE: in demo content, program 'name' is NOT unique, e.g. GTEx
        occurs twice due to overlap in imports!  The 'id' column is
        unique.
        """
        # Trivial query shape: dump all entities of a single table.
        program_table = self.builder.CFDE.common_fund_program
        return program_table.path.entities().fetch()

    def list_infotypes(self):
        """Return the list of information type vocabulary terms."""
        # Simple vocabulary dump: all rows of information_type.
        term_table = self.builder.CFDE.information_type
        return term_table.path.entities().fetch()

    def list_formats(self):
        """Return the list of file format vocabulary terms."""
        # Simple vocabulary dump: all rows of file_format.
        term_table = self.builder.CFDE.file_format
        return term_table.path.entities().fetch()

    def list_program_file_stats(self, programid=None):
        """Return list of file statistics per program.

        :param programid: Optional filter to a single program id
            (default None, meaning all programs).

        NOTE: this query will not return a row for a program with zero files...

        NOTE: only non-null File.length values are summed, so null may
        be returned if none of the files have specified a length...

        """
        # more complex case: build joined table path
        # NOTE: link()/filter() mutate path in place here — their return
        # values are deliberately unassigned.
        path = self.builder.CFDE.dataset.path
        if programid is not None:
            path.filter(path.dataset.data_source == programid)
        path.link(self.builder.CFDE.files_in_datasets)
        path.link(self.builder.CFDE.file)
        # and return grouped aggregate results
        # group per data_source; count files and sum their byte lengths
        results = path.groupby(path.dataset.data_source, ).attributes(
            Cnt(path.file).alias('file_cnt'),
            Sum(path.file.length).alias('byte_cnt'),
        )
        return results.fetch()

    def list_program_sampletype_file_stats(self, programid=None):
        """Return list of file statistics per (program, sample_type).

        :param programid: Optional filter to a single program id
            (default None, meaning all programs).

        Like list_program_file_stats, but also include biosample
        sample_type in the group key, for more detailed result
        categories.

        """
        path = self.builder.CFDE.sample_type.path
        path.link(self.builder.CFDE.bio_sample)
        path.link(self.builder.CFDE.assayed_by)
        path.link(self.builder.CFDE.data_event)
        path.link(self.builder.CFDE.generated_by)
        # right-outer join so we can count files w/o this biosample/event linkage
        # FIX: the linked instance is addressed as path.generated_by
        # (matching the table linked above and the identical usage in
        # list_program_file_stats_by_time_bin); the previous
        # "path.GeneratedBy" spelling would raise AttributeError.
        path.link(self.builder.CFDE.file,
                  on=(path.generated_by.file_id == self.builder.CFDE.file.id),
                  join_type='right')
        path.link(self.builder.CFDE.files_in_datasets)
        path.link(self.builder.CFDE.dataset)

        if programid is not None:
            path.filter(path.dataset.data_source == programid)

        results = path.groupby(
            # compound grouping key
            path.dataset.data_source,
            path.bio_sample.sample_type.alias('sample_type_id'),
        ).attributes(
            # 'name' is part of Table API so we cannot use attribute-based lookup...
            path.sample_type.column_definitions['name'].alias(
                'sample_type_name'),
            Cnt(path.file).alias('file_cnt'),
            Sum(path.file.length).alias('byte_cnt'),
        )

        return results.fetch()

    def list_program_file_stats_by_time_bin(self,
                                            nbins=100,
                                            min_ts=None,
                                            max_ts=None):
        """Return list of file statistics per (data_source, ts_bin)

        :param nbins: The number of bins to divide the time range
        :param min_ts: The lower (closed) bound of the time range
        :param max_ts: The upper (open) bound of the time range

        If min_ts or max_ts are unspecified, preliminary queries are
        performed to determine the actual timestamp range found in the
        source data. These values are used to configure the binning
        distribution.

        Files generation times are found from DataEvent.event_ts where
        linked to File by the GeneratedBy association. Files without
        such linkage are considered to have null event times.

        Results are keyed by data_source and ts_bin group keys.

        NOTE: Results are sparse! Groups are only returned when at
        least one matching row is found. This means that some bins,
        described next, may be absent in a particular query result.

        Each group includes a ts_bin field which is a three-element
        list describing the time bin:

           [ bin_number, lower_bound, upper_bound ]

        The files within the selected range will be summarized in groups
        with bins:

           [ 1, min_ts, (max_ts - min_ts)/nbins ]
           ...
           [ nbins, max_ts - (max_ts - min_ts)/nbins, max_ts ]

        Files without known event_ts will be summarized in a row with
        a special null bin:

           [ null, null, null ]

        Other files will be summarized in rows with special bins:

           [ 0, null, min_ts ]
           [ nbins+1, max_ts, null ]

        i.e. for files with event_ts below min_ts, or with event_ts
        above max_ts, respectively.

        """
        # NOTE: link() mutates path in place here — return values are
        # deliberately unassigned.
        path = self.builder.CFDE.data_event.path
        path.link(self.builder.CFDE.generated_by)
        # right-outer join so we can count files w/o this dataevent linkage
        path.link(self.builder.CFDE.file,
                  on=(path.generated_by.file_id == self.builder.CFDE.file.id),
                  join_type='right')
        path.link(self.builder.CFDE.files_in_datasets)
        path.link(self.builder.CFDE.dataset)

        # build this list once so we can reuse it for grouping and sorting
        groupkey = [
            path.dataset.data_source,
            Bin(path.data_event.event_ts, nbins, min_ts,
                max_ts).alias('ts_bin'),
        ]

        # sorted output lets callers accumulate running sums in one pass
        results = path.groupby(*groupkey).attributes(
            Cnt(path.file.id).alias('file_cnt'),
            Sum(path.file.length).alias('byte_cnt'),
        ).sort(*groupkey)
        return results.fetch()

    def running_sum_program_file_stats(self,
                                       nbins=100,
                                       min_ts=None,
                                       max_ts=None):
        """Transform results of list_program_file_stats_by_time to produce running sums

        :param nbins: The number of bins to divide the time range
        :param min_ts: The lower (closed) bound of the time range
        :param max_ts: The upper (open) bound of the time range

        The underlying query counts files and sums bytecounts only
        within each time bin. I.e. it represents change rather than
        total data capacities at given times.

        This function accumulates values to show total capacity trends.

        Yields one dict per underlying row with accumulated
        file_cnt/byte_cnt per data_source group.
        """
        # accumulator state for the current data_source group
        data_source = None
        file_cnt = None
        byte_cnt = None
        # because underlying query results are sorted, we can just iterate...
        for row in self.list_program_file_stats_by_time_bin(
                nbins, min_ts, max_ts):
            if data_source != row['data_source']:
                # reset state for next group
                data_source = row['data_source']
                file_cnt = 0
                byte_cnt = 0
            # null per-bin aggregates contribute nothing to the sums
            if row['file_cnt'] is not None:
                file_cnt += row['file_cnt']
            if row['byte_cnt'] is not None:
                byte_cnt += row['byte_cnt']
            yield {
                'data_source': data_source,
                'ts_bin': row['ts_bin'],
                'file_cnt': file_cnt,
                'byte_cnt': byte_cnt
            }