示例#1
0
def audit_file_processed_derived_from(value, system):
    '''
    Check the derived_from list of a processed file.

    Files whose output_category is 'raw data' or 'reference' are skipped.
    Any other file with a missing or empty derived_from list is reported.
    For BAM files that are not replaced, each bam/fastq/raw-reads parent is
    compared against the file's dataset, and the BAM is reported when no
    such parent has a status level at least as high as its own.
    '''
    if value['output_category'] in ['raw data', 'reference']:
        return

    # Shared opening and closing text of the audit messages below.
    explanation = ('derived_from is a list of files that were used to create a given file; '
                   'for example, fastq file(s) will appear in the derived_from list of an '
                   'alignments file. ')
    missing_suffix = 'is missing the requisite file specification in its derived_from list.'

    if 'derived_from' not in value or len(value['derived_from']) == 0:
        yield AuditFailure(
            'missing derived_from',
            explanation + 'Processed file {} '.format(value['@id']) + missing_suffix,
            level='INTERNAL_ACTION')
        return

    # Only BAMs get the deeper check. Replaced BAMs are ignored because the
    # missing derived_from logic should be applied to their replacements
    # instead (ENCD-3595).
    if value['file_format'] != 'bam' or value['status'] == 'replaced':
        return

    qualifying_parents = 0
    for parent in value.get('derived_from'):
        parent_format = parent['file_format']
        is_raw_reads = (parent_format in ['fasta', 'csfasta', 'csqual']
                        and parent['output_type'] == 'reads'
                        and parent['output_category'] == 'raw data')
        if not (parent_format == 'bam' or parent_format == 'fastq' or is_raw_reads):
            continue

        # Audit shouldn't trigger if status isn't registered in STATUS_LEVEL dict.
        if parent['status'] not in STATUS_LEVEL or value['status'] not in STATUS_LEVEL:
            return

        if STATUS_LEVEL[parent['status']] >= STATUS_LEVEL[value['status']]:
            qualifying_parents += 1

        if parent['dataset'] != value['dataset'].get('@id'):
            yield AuditFailure(
                'inconsistent derived_from',
                explanation +
                'Alignments file {} '.format(value['@id']) +
                'from experiment {} '.format(value['dataset']['@id']) +
                'specifies a file {} '.format(parent['@id']) +
                'from a different experiment {} '.format(parent['dataset']) +
                'in its derived_from list.',
                level='INTERNAL_ACTION')

    if qualifying_parents == 0:
        yield AuditFailure(
            'missing derived_from',
            explanation + 'Alignments file {} '.format(value['@id']) + missing_suffix,
            level='INTERNAL_ACTION')
示例#2
0
def audit_item_schema(value, system):
    '''
    Validate the item's stored properties against its schema.

    Properties are first upgraded to the type's current schema_version when
    the stored version differs. An upgrade that fails with anything other
    than RuntimeError is reported as 'upgrade failure'. Each validation
    error is reported as its own 'validation error' failure.
    '''
    context = system['context']
    registry = system['registry']
    if not context.schema:
        return

    props = context.properties.copy()
    stored_version = props.get('schema_version', '')
    wanted_version = context.type_info.schema_version

    if wanted_version is not None and stored_version != wanted_version:
        upgrader = registry[UPGRADER]
        try:
            props = upgrader.upgrade(
                context.type_info.name,
                props,
                stored_version,
                wanted_version,
                finalize=False,
                context=context,
                registry=registry,
            )
        except RuntimeError:
            # RuntimeError is deliberately let through to the caller.
            raise
        except Exception as exc:
            yield AuditFailure(
                'upgrade failure',
                '%r upgrading from %r to %r' % (exc, stored_version, wanted_version),
                level='INTERNAL_ACTION')
            return

        props['schema_version'] = wanted_version

    props['uuid'] = str(context.uuid)
    _, errors = validate(context.schema, props, props)
    for error in errors:
        location = list(error.path)
        # Append the path to the offending property when the error has one.
        category = (
            'validation error: ' + '/'.join(str(part) for part in location)
            if location else 'validation error'
        )
        detail = 'Object {} has schema error {}'.format(value['@id'], error.message)
        yield AuditFailure(category, detail, level='INTERNAL_ACTION')
示例#3
0
def audit_antibody_missing_characterizations(value, system):
    '''
    Report which characterizations an antibody lot is still lacking.

    Nothing is reported when any target is investigated as a control.
    Otherwise the audit reports missing primary/secondary characterizations,
    lot reviews still waiting on a compliant primary, and secondary
    characterizations of which none is compliant or exempt.
    '''
    if any('control' in target.get('investigated_as') for target in value['targets']):
        return

    antibody_id = value['@id']
    characterizations = value['characterizations']
    if not characterizations:
        yield AuditFailure(
            'no characterizations submitted',
            '{} does not have any supporting characterizations submitted.'.format(antibody_id),
            level='NOT_COMPLIANT')
        return

    primaries = [c for c in characterizations if 'primary_characterization_method' in c]
    secondaries = [c for c in characterizations if 'secondary_characterization_method' in c]
    # A list (not a short-circuiting any()) so every secondary's status is read.
    accepted_secondaries = [
        c for c in secondaries
        if c['status'] in ['compliant', 'exempt from standards']
    ]

    if not primaries:
        yield AuditFailure(
            'no primary characterizations',
            '{} does not have any primary characterizations submitted.'.format(antibody_id),
            level='NOT_COMPLIANT')

    if not secondaries:
        yield AuditFailure(
            'no secondary characterizations',
            '{} does not have any secondary characterizations submitted.'.format(antibody_id),
            level='NOT_COMPLIANT')

    # Lot review details that indicate a compliant primary is still needed.
    awaiting_primary = (
        'Awaiting a compliant primary and pending review of a secondary characterization.',
        'Awaiting a compliant primary and secondary characterization was not reviewed.',
        'Awaiting a compliant primary and submission of a secondary characterization.',
        'Awaiting a compliant primary characterization.',
        'Awaiting compliant primary and secondary characterizations.',
        'Primary characterization not reviewed and awaiting a compliant secondary characterization.',
        'Primary characterization not reviewed and pending review of a secondary characterization.',
        'Primary characterization not reviewed.',
        'Pending review of primary and secondary characterizations.',
        'Pending review of primary characterization and awaiting a compliant secondary characterization.',
        'Pending review of primary characterization and secondary characterization not reviewed.',
        'Pending review of primary characterization.',
    )
    for review in value['lot_reviews']:
        if review['detail'] not in awaiting_primary:
            continue
        where = review['biosample_term_name']
        if where == 'any cell type or tissue':
            where = 'one or more cell types/tissues.'
        yield AuditFailure(
            'need compliant primaries',
            '{} needs a compliant primary in {}'.format(antibody_id, where),
            level='NOT_COMPLIANT')

    if secondaries and not accepted_secondaries:
        yield AuditFailure(
            'need compliant secondary',
            '{} needs a compliant secondary characterization.'.format(antibody_id),
            level='NOT_COMPLIANT')
示例#4
0
def audit_reference_epigenome_donor_biosample(value, system):
    '''
    Warn when a reference epigenome mixes treatments, biosample term names
    or donors across the replicates of its related datasets.

    Deleted, replaced and revoked datasets and deleted replicates are
    ignored. A biosample with no treatments counts as 'untreated'.
    '''
    inactive = ['deleted', 'replaced', 'revoked']
    if value['status'] in inactive:
        return

    if 'related_datasets' not in value:
        return

    treatments = set()
    term_names = set()
    donors = set()
    for dataset in value['related_datasets']:
        if dataset['status'] in inactive or 'replicates' not in dataset:
            continue
        for replicate in dataset['replicates']:
            if (replicate['status'] in ['deleted']
                    or 'library' not in replicate
                    or 'biosample' not in replicate['library']):
                continue
            biosample = replicate['library']['biosample']
            if 'biosample_term_name' in biosample:
                term_names.add(biosample['biosample_term_name'])
            if 'donor' in biosample:
                donors.add(biosample['donor']['accession'])
            if 'treatments' not in biosample or len(biosample['treatments']) == 0:
                treatments.add('untreated')
            else:
                # One sorted, comma-joined label per biosample.
                labels = [
                    'treated with ' + treatment['treatment_term_name']
                    for treatment in biosample['treatments']
                ]
                treatments.add(', '.join(sorted(labels)))

    # (collected values, message template, audit category), in report order.
    checks = (
        (treatments,
         ' has biosample associated with different tretments {}.',
         'multiple biosample treatments in reference epigenome'),
        (term_names,
         ' has multiple biosample term names {}.',
         'multiple biosample term names in reference epigenome'),
        (donors,
         ' has multiple donors {}.',
         'multiple donors in reference epigenome'),
    )
    for collected, template, category in checks:
        if len(collected) > 1:
            detail = 'Reference Epigenome {} '.format(value['@id']) + \
                     template.format(collected)
            yield AuditFailure(category, detail, level='WARNING')
示例#5
0
def audit_biosample_term(value, system):
    '''
    Biosample_term_id and biosample_term_name
    and biosample_type should all be present.
    This should be handled by schemas.
    Biosample_term_id should be in the ontology.
    Biosample_term_name should match biosample_term_id.

    Deleted biosamples and biosamples without a biosample_term_id are not
    audited. At most one failure is yielded: the checks run in the order
    NTR term, term prefix vs. biosample_type, ontology membership, and
    term name vs. ontology name/synonyms, stopping at the first failure.
    '''

    if value['status'] in ['deleted']:
        return

    if 'biosample_term_id' not in value:
        return

    ontology = system['registry']['ontology']
    term_id = value['biosample_term_id']
    term_name = value.get('biosample_term_name')

    if term_id.startswith('NTR:'):
        detail = 'Biosample {} has a New Term Request {} - {}'.format(
            value['@id'], term_id, term_name)
        yield AuditFailure('NTR biosample', detail, level='INTERNAL_ACTION')
        return

    # The ontology prefix (the text before the first ':') must be one of the
    # prefixes registered for this biosample_type.
    biosample_type = value['biosample_type']
    allowed_prefixes = biosampleType_ontologyPrefix[biosample_type]
    biosample_prefix = term_id.split(':')[0]
    if biosample_prefix not in allowed_prefixes:
        detail = 'Biosample {} of '.format(value['@id']) + \
                 'type {} '.format(biosample_type) + \
                 'has biosample_term_id {} '.format(term_id) + \
                 'that is not one of ' + \
                 '{}'.format(allowed_prefixes)
        yield AuditFailure('biosample term-type mismatch',
                           detail,
                           level='INTERNAL_ACTION')
        return

    if term_id not in ontology:
        detail = 'Biosample {} has biosample_term_id of {} which is not in ontology'.format(
            value['@id'], term_id)
        # Report the full message; the bare term_id was previously passed
        # here, leaving the detail string unused.
        yield AuditFailure('term_id not in ontology',
                           detail,
                           level='INTERNAL_ACTION')
        return

    ontology_term_name = ontology[term_id]['name']
    if ontology_term_name != term_name and term_name not in ontology[term_id][
            'synonyms']:
        detail = 'Biosample {} has '.format(value['@id']) + \
                 'a mismatch between biosample term_id ({}) '.format(term_id) + \
                 'and term_name ({}), ontology term_name for term_id {} '.format(
                     term_name, term_id) + \
                 'is {}.'.format(ontology_term_name)
        yield AuditFailure('inconsistent ontology term', detail, level='ERROR')
        return
def audit_antibody_characterization_target(value, system):
    '''
    Make sure that target in characterization
    matches target of antibody.

    Raises AuditFailure on the first mismatch found and returns None
    otherwise.
    '''
    antibody = value['characterizes']
    target = value['target']

    if 'recombinant protein' not in target['investigated_as']:
        # Ordinary target: its name must appear among the antibody's targets.
        known_names = [candidate.get('name') for candidate in antibody['targets']]
        if any(target['name'] == name for name in known_names):
            return
        names_text = str(known_names).replace('\'', '')
        detail = (
            'Antibody characterization {} target is {}, but it could not be found in antibody\'s {} target list {}.'
            .format(
                audit_link(path_to_text(value['@id']), value['@id']),
                target['name'],
                audit_link(path_to_text(antibody['@id']), antibody['@id']),
                names_text))
        raise AuditFailure('inconsistent target', detail, level='ERROR')

    # Recombinant protein: the antibody must be against a tag, and the part
    # of the label before the first '-' must be one of the antibody's labels.
    prefix = target['label'].split('-')[0]
    antibody_labels = set()
    antibody_roles = set()
    for candidate in antibody['targets']:
        antibody_labels.add(candidate['label'])
        antibody_roles.update(candidate['investigated_as'])

    if not ('tag' in antibody_roles or 'synthetic tag' in antibody_roles):
        detail = (
            'Antibody {} is not for a tagged protein, yet target {} in antibody characterization {} is investigated_as a recombinant protein.'
            .format(
                audit_link(path_to_text(antibody['@id']), antibody['@id']),
                prefix,
                audit_link(path_to_text(value['@id']), value['@id']),
            ))
        raise AuditFailure('not tagged antibody', detail, level='ERROR')

    if prefix not in antibody_labels:
        detail = (
            '{} is not found in target list in antibody characterization {} for antibody {}'
            .format(
                prefix.capitalize(),
                audit_link(path_to_text(value['@id']), value['@id']),
                audit_link(path_to_text(antibody['@id']), antibody['@id'])))
        raise AuditFailure('mismatched tag target', detail, level='ERROR')
示例#7
0
def audit_paired_with(value, system):
    '''
    A file with a paired_end needs a paired_with.
    Should be handled in the schema.
    A paired_with should be the same replicate.

    Also reports a read 1 file that is paired_with by more than one file,
    and a read_count that differs between the two mates.
    '''
    # Nothing to check without a paired_end, for a combined '1,2' file, or
    # when the mate (or the mate's replicate) is not available.
    if ('paired_end' not in value
            or value['paired_end'] in ['1,2']
            or 'paired_with' not in value
            or 'replicate' not in value['paired_with']):
        return

    mate = value['paired_with']

    if 'replicate' not in value:
        yield AuditFailure(
            'missing replicate',
            'File {} has paired_end = {}. It requires a replicate'.format(
                value['@id'], value['paired_end']),
            level='INTERNAL_ACTION')
    elif value['replicate'].get('@id') != mate['replicate']:
        yield AuditFailure(
            'inconsistent paired_with',
            'File {} has replicate {}. It is paired_with file {} with replicate {}'.format(
                value['@id'],
                value['replicate'].get('@id'),
                mate['@id'],
                mate.get('replicate')),
            level='ERROR')

    if value['paired_end'] == '1':
        reverse_links = system['context'].get_rev_links('paired_with')
        if len(reverse_links) > 1:
            yield AuditFailure(
                'multiple paired_with',
                'Paired end 1 file {} paired_with by multiple paired end 2 files: {!r}'.format(
                    value['@id'], reverse_links),
                level='ERROR')
            return

    own_reads = value.get('read_count')
    mate_reads = mate.get('read_count')

    # Compare only when both counts are present (and non-zero).
    if (own_reads and mate_reads) and (own_reads != mate_reads):
        yield AuditFailure(
            'inconsistent read count',
            'File {} has {} reads. It is paired_with file {} that has {} reads'.format(
                value['@id'], own_reads, mate['@id'], mate_reads),
            level='ERROR')
示例#8
0
def audit_item_status(value, system):
    '''
    Report linked objects whose status level is lower than this item's.

    Levels come from STATUS_LEVEL (50 for unlisted statuses); 'revoked' and
    'archived' get 50 added. Items at level 0 are not audited. Links under
    supersedes, step_run, derived_from, controlled_by and possible_controls
    are not followed.
    '''
    if 'status' not in value:
        return

    own_status = value['status']
    level = STATUS_LEVEL.get(own_status, 50)
    if level == 0:
        return
    if own_status in ['revoked', 'archived']:
        level += 50

    context = system['context']
    request = system['request']

    ignored_links = [
        'supersedes', 'step_run', 'derived_from', 'controlled_by',
        'possible_controls'
    ]
    linked = set()
    for schema_path in context.type_info.schema_links:
        if schema_path not in ignored_links:
            linked.update(simple_path_ids(value, schema_path))

    for path in linked:
        linked_value = request.embed(path + '@@object')
        if 'status' not in linked_value or linked_value['status'] == 'disabled':
            continue

        # Special case: A revoked file can have a deleted replicate ticket #2938
        if ('File' in value['@type'] and value['status'] == 'revoked'
                and 'Replicate' in linked_value['@type']
                and linked_value['status'] == 'deleted'):
            continue

        linked_level = STATUS_LEVEL.get(linked_value['status'], 50)
        if linked_value['status'] in ['revoked', 'archived']:
            linked_level += 50

        # A level-0 linked object and a lower-level linked object produce
        # the same report.
        if linked_level == 0 or linked_level < level:
            detail = '{} {} has {} subobject {}'.format(
                value['status'], value['@id'], linked_value['status'],
                linked_value['@id'])
            yield AuditFailure('mismatched status',
                               detail,
                               level='INTERNAL_ACTION')
示例#9
0
def audit_biosample_modifications(value, system):
    '''
    Compare a whole-organism biosample's genetic_modifications with its
    model_organism_donor_modifications.

    Reports modifications listed on the biosample but absent from the donor
    list (only when the donor list is present), and modifications that
    appear in both lists. Other biosample classifications are not audited.
    '''
    if value['biosample_ontology']['classification'] != 'whole organisms':
        return

    donor_list_present = 'model_organism_donor_modifications' in value
    donor_mods = set()
    if donor_list_present:
        donor_mods.update(value['model_organism_donor_modifications'])

    sample_mods = set()
    if 'genetic_modifications' in value:
        sample_mods.update(value['genetic_modifications'])

    only_on_sample = sample_mods - donor_mods
    if only_on_sample and donor_list_present:
        extra_links = [
            audit_link(path_to_text(mod), mod) for mod in only_on_sample
        ]
        donor_links = [
            audit_link(path_to_text(mod), mod) for mod in donor_mods
        ]
        detail = (
            'Biosample {} contains '
            ' genetic modificatons {} that are not present'
            ' in the list of genetic modifications {} of the corresponding strain.'
            .format(audit_link(path_to_text(value['@id']), value['@id']),
                    ', '.join(extra_links), ', '.join(donor_links)))
        yield AuditFailure('mismatched genetic modifications',
                           detail,
                           level='INTERNAL_ACTION')

    on_both = donor_mods & sample_mods
    # Link lists are built before the check, as in the original ordering.
    shared_links = [audit_link(path_to_text(mod), mod) for mod in on_both]
    donor_links = [audit_link(path_to_text(mod), mod) for mod in donor_mods]
    if on_both:
        detail = ('Biosample {} contains '
                  'genetic modifications {} that '
                  'are duplicates of genetic modifications {} '
                  'of the corresponding strain.'.format(
                      audit_link(path_to_text(value['@id']), value['@id']),
                      ', '.join(shared_links),
                      ', '.join(donor_links)))
        yield AuditFailure('duplicated genetic modifications',
                           detail,
                           level='INTERNAL_ACTION')
示例#10
0
def audit_term(value, system):
    '''
    The classification, term_id and term_name should all be present.
    This should be handled by schemas.
    The term_id should be in the ontology.
    The term_name should match the term_id.

    Deleted objects are skipped, and at most one failure is yielded.
    '''
    if value['status'] in ['deleted']:
        return

    ontology = system['registry']['ontology']
    term_name = value['term_name']
    term_id = value['term_id']

    if term_id.startswith('NTR:'):
        object_link = audit_link(path_to_text(value['@id']), value['@id'])
        detail = 'BiosampleType {} has a New Term Request {} - {}.'.format(
            object_link, term_id, term_name)
        yield AuditFailure('NTR biosample', detail, level='INTERNAL_ACTION')
        return

    if term_id not in ontology:
        object_link = audit_link(path_to_text(value['@id']), value['@id'])
        # The ontology name is the text before the first ':' of the term_id.
        ontology_prefix = term_id.split(':', 1)[0]
        detail = ('BiosampleType {} specifies a term_id {} '
                  'that is not part of the {} ontology.').format(
                      object_link, term_id, ontology_prefix)
        yield AuditFailure('term_id not in ontology', detail,
                           level='INTERNAL_ACTION')
        return

    entry = ontology[term_id]
    ontology_term_name = entry['name']
    if ontology_term_name == term_name or term_name in entry['synonyms']:
        return

    detail = ('BiosampleType {object_id} has a mismatch between'
              ' term_id ({term_id}) and term_name ({term_name}),'
              ' ontology term_name for term_id {term_id} is'
              ' {ontology_term_name}.').format(
                  object_id=audit_link(path_to_text(value['@id']), value['@id']),
                  term_id=term_id,
                  term_name=term_name,
                  ontology_term_name=ontology_term_name)
    yield AuditFailure('inconsistent ontology term', detail, level='ERROR')
示例#11
0
def audit_read_structure(value, system):
    '''
    Check every read_structure element's coordinates: neither start nor end
    may be 0, and start may not be greater than end. An element can be
    reported for both problems.
    '''
    for part in value.get('read_structure', []):
        first, last = part['start'], part['end']
        if first == 0 or last == 0:
            zero_detail = ('The read_stucture is 1-based. '
                           'Neither start or end can be 0 for sequence element {}.'
                           ).format(part['sequence_element'])
            yield AuditFailure('invalid read_structure', zero_detail, level='ERROR')
        if first > last:
            order_detail = ('The start coordinate is bigger than the end coordinate '
                            'for sequence element {}.'
                            ).format(part['sequence_element'])
            yield AuditFailure('invalid read_structure', order_detail, level='ERROR')
def audit_antibody_characterization_review(value, system):
    '''
    Make sure that biosample terms are in ontology
    for each characterization_review.

    Characterizations with status 'not reviewed', 'not submitted for review
    by lab', 'deleted' or 'in progress' are skipped, as are secondary
    characterizations. The audit stops at the first failure it yields.
    '''
    if (value['status'] in ['not reviewed', 'not submitted for review by lab', 'deleted', 'in progress']):
        return

    if 'secondary_characterization_method' in value:
        return

    if value['characterization_reviews']:
        ontology = system['registry']['ontology']
        for review in value['characterization_reviews']:
            term_id = review['biosample_term_id']
            term_name = review['biosample_term_name']
            term_type = review['biosample_type']

            if term_id.startswith('NTR:'):
                detail = '{} contains a New Term Request {} - {}'.format(
                    value['@id'],
                    term_id,
                    term_name
                    )
                yield AuditFailure('NTR biosample', detail, level='INTERNAL_ACTION')
                return
            if term_id not in ontology:
                detail = 'Antibody characterization {} contains '.format(value['@id']) + \
                         'a biosample_term_id {} that is not in the ontology'.format(term_id)

                # Report the full message; the bare term_id was previously
                # passed here, leaving the detail string unused.
                yield AuditFailure('term_id not in ontology', detail, level='INTERNAL_ACTION')
                return
            ontology_term_name = ontology[term_id]['name']
            if ontology_term_name != term_name and term_name not in ontology[term_id]['synonyms']:
                detail = 'Antibody characterization {} '.format(value['@id']) + \
                         'has a mismatched term {} - {} expected {}'.format(term_id,
                                                                            term_name,
                                                                            ontology_term_name)

                yield AuditFailure('inconsistent ontology term', detail, level='ERROR')
                return
            # The ontology prefix (text before the first ':') must be one of
            # the prefixes registered for the review's biosample_type.
            biosample_prefix = term_id.split(':')[0]
            if biosample_prefix not in biosampleType_ontologyPrefix[term_type]:
                detail = 'Antibody characterization {} is '.format(value['@id']) + \
                         'of type {} '.format(term_type) + \
                         'and has biosample_term_id {} '.format(term_id) + \
                         'that is not one of ' + \
                         '{}'.format(biosampleType_ontologyPrefix[term_type])
                yield AuditFailure('characterization review with biosample term-type mismatch', detail,
                                   level='INTERNAL_ACTION')
                return
示例#13
0
def audit_paired_with(value, system):
    '''
    A file with a paired_end needs a paired_with.
    Should be handled in the schema.
    A paired_with should be the same replicate.

    Raises AuditFailure on the first problem found and returns None
    otherwise. Deleted, replaced and revoked files are not audited.
    '''
    if value['status'] in ['deleted', 'replaced', 'revoked']:
        return

    if 'paired_end' not in value:
        return

    if 'paired_with' not in value:
        # The missing mate is read 1 for a read 2 file, read 2 otherwise.
        expected_mate = "1" if value['paired_end'] == "2" else "2"
        detail = ''.join([
            'Sequencing read{} file {} is the result of a '.format(
                value['paired_end'], value['@id']),
            'paired-end sequencing run according to the submitted metadata. ',
            'An association with a read{} file needs to be specified.'.format(
                expected_mate),
        ])
        raise AuditFailure('missing paired_with', detail, level='ERROR')

    mate = value['paired_with']
    if 'replicate' not in mate:
        return

    if 'replicate' not in value:
        raise AuditFailure(
            'missing replicate',
            'File {} has paired_end = {}. It requires a replicate'.format(
                value['@id'], value['paired_end']),
            level='INTERNAL_ACTION')

    if value['replicate'] != mate['replicate']:
        raise AuditFailure(
            'inconsistent paired_with',
            'File {} has replicate {}. It is paired_with file {} with replicate {}'.format(
                value['@id'],
                value.get('replicate'),
                mate['@id'],
                mate.get('replicate')),
            level='ERROR')

    if value['paired_end'] == '1':
        reverse_links = system['context'].get_rev_links('paired_with')
        if len(reverse_links) > 1:
            raise AuditFailure(
                'multiple paired_with',
                'Paired end 1 file {} paired_with by multiple paired end 2 files: {!r}'.format(
                    value['@id'], reverse_links),
                level='ERROR')
示例#14
0
def audit_biosample_donor(value, system):
    '''
    A biosample should have a donor.
    The organism of donor and biosample should match.

    When the donor has a mutated_gene, its organism must match as well and
    it must not be investigated as a tag, control, recombinant protein or
    modification. Deleted biosamples are skipped.
    '''
    if value['status'] in ['deleted']:
        return

    if 'donor' not in value:
        # A missing donor is only an internal action for GGR awards.
        is_ggr = ('award' in value and 'rfa' in value['award']
                  and value['award']['rfa'] == 'GGR')
        yield AuditFailure(
            'missing donor',
            'Biosample {} is not associated with any donor.'.format(value['@id']),
            level='INTERNAL_ACTION' if is_ggr else 'ERROR')
        return

    donor = value['donor']
    sample_organism = value.get('organism')
    if sample_organism != donor.get('organism'):
        yield AuditFailure(
            'inconsistent organism',
            'Biosample {} is organism {}, yet its donor {} is organism {}. Biosamples require a donor of the same species'.format(
                value['@id'], sample_organism, donor['@id'], donor.get('organism')),
            level='ERROR')

    if 'mutated_gene' not in donor:
        return

    gene = donor['mutated_gene']
    if sample_organism != gene.get('organism'):
        yield AuditFailure(
            'inconsistent mutated_gene organism',
            'Biosample {} is organism {}, but its donor {} mutated_gene is in {}. Donor mutated_gene should be of the same species as the donor and biosample'.format(
                value['@id'], sample_organism, donor['@id'], gene.get('organism')),
            level='ERROR')

    disallowed_roles = [
        'tag', 'control', 'recombinant protein',
        'nucleotide modification',
        'other post-translational modification'
    ]
    # One failure per disallowed role found on the gene.
    for role in gene.get('investigated_as'):
        if role not in disallowed_roles:
            continue
        yield AuditFailure(
            'invalid donor mutated_gene',
            'Donor {} has an invalid mutated_gene {}. Donor mutated_genes should not be tags, controls, recombinant proteins or modifications'.format(
                donor['@id'], gene.get('name')),
            level='ERROR')
示例#15
0
def audit_fly_worm_donor_genotype_dbxrefs(value, system):
    '''
    Fly and worm donors need their genotype information and dbxrefs
    filled out since the genotype will be part of the biosample summary.

    Only objects whose '@type' contains 'FlyDonor' or 'WormDonor' are
    checked. Yields a WARNING AuditFailure when genotype is missing or
    empty, and another WARNING when dbxrefs is missing or empty.
    '''
    donor_types = value['@type']
    if 'FlyDonor' not in donor_types and 'WormDonor' not in donor_types:
        return

    if not value.get('genotype'):
        detail = 'Strain {} should have a value '.format(value['@id']) + \
            'specified for genotype.'
        yield AuditFailure('missing genotype', detail, level='WARNING')

    # .get() so a donor without a dbxrefs property is reported as missing
    # dbxrefs instead of aborting the audit with a KeyError.
    if not value.get('dbxrefs'):
        detail = 'Strain {} should have one or more ids '.format(value['@id']) + \
            'specified in the dbxrefs array.'
        yield AuditFailure('missing dbxrefs', detail, level='WARNING')
示例#16
0
def audit_file_matching_md5sum(value, system):
    '''
    Files with md5 sums matching other files should be marked with a WARNING audit.
    If the other files are listed as matching but in fact have different md5 sums,
    the file should be flagged with an ERROR for incorrect metadata.

    Nothing is checked unless the file has a matching_md5sum property and its
    status is one of released, revoked, archived or in progress. A listed file
    with the same uuid as this file also yields an ERROR. Listed files with an
    equal md5sum and a checked status are collected into a single WARNING.
    '''
    checked_statuses = ['released', 'revoked', 'archived', 'in progress']
    if 'matching_md5sum' not in value or value.get(
            'status') not in checked_statuses:
        return

    matching_files = []
    for other in value.get('matching_md5sum'):
        if other.get('uuid') == value.get('uuid'):
            detail = ('File {} is listing itself as having '
                      'a matching md5 sum.'.format(
                          audit_link(path_to_text(value['@id']),
                                     value['@id'])))
            yield AuditFailure('inconsistent matching_md5sum',
                               detail,
                               level='ERROR')
        if other.get('md5sum') != value.get('md5sum'):
            detail = (
                'File {} is listed as having a matching md5 sum '
                'as file {}, but the files have different md5 sums.'.format(
                    audit_link(path_to_text(other['@id']), other['@id']),
                    audit_link(path_to_text(value['@id']), value['@id'])))
            yield AuditFailure('inconsistent matching_md5sum',
                               detail,
                               level='ERROR')
        elif other.get('status') in checked_statuses:
            matching_files.append(other['@id'])

    if not matching_files:
        return

    # Build the links once, after the loop, rather than rebuilding the whole
    # list every time another matching file is found.
    matching_files_links = [
        audit_link(path_to_text(file_id), file_id)
        for file_id in matching_files
    ]
    if len(matching_files_links) > 2:
        matching_files_joined = 'Files {}, and {}'.format(
            ', '.join(matching_files_links[:-1]), matching_files_links[-1])
    else:
        matching_files_joined = ' and '.join(matching_files_links)

    detail = ('The md5 sum of file {} '
              'matches that of file(s) {}.'.format(
                  audit_link(path_to_text(value['@id']), value['@id']),
                  matching_files_joined))
    yield AuditFailure('matching md5 sums', detail, level='WARNING')
def audit_antibody_characterization_review(value, system):
    '''
    Make sure that biosample terms are in ontology
    for each characterization_review.

    Characterizations that are not reviewed, not submitted, deleted or in
    progress are skipped, as are those with a
    secondary_characterization_method. The ontology is read from
    system['registry']['ontology']. The audit stops at the first review that
    produces a failure: a New Term Request id, an id absent from the
    ontology, or a term name that matches neither the ontology name nor one
    of its synonyms.
    '''
    if (value['status'] in [
            'not reviewed', 'not submitted for review by lab', 'deleted',
            'in progress'
    ]):
        return

    if 'secondary_characterization_method' in value:
        return

    # .get() so a characterization without any reviews is simply skipped
    # instead of raising KeyError.
    reviews = value.get('characterization_reviews')
    if not reviews:
        return

    ontology = system['registry']['ontology']
    for review in reviews:
        term_id = review['biosample_term_id']
        term_name = review['biosample_term_name']
        if term_id.startswith('NTR:'):
            detail = '{} contains a New Term Request {} - {}'.format(
                value['@id'], term_id, term_name)
            yield AuditFailure('NTR biosample',
                               detail,
                               level='INTERNAL_ACTION')
            return
        if term_id not in ontology:
            detail = 'Antibody characterization {} contains '.format(value['@id']) + \
                     'a biosample_term_id {} that is not in the ontology'.format(term_id)
            # Report the explanatory message, not just the bare term id.
            yield AuditFailure('term_id not in ontology',
                               detail,
                               level='INTERNAL_ACTION')
            return
        ontology_term_name = ontology[term_id]['name']
        if ontology_term_name != term_name and term_name not in ontology[
                term_id]['synonyms']:
            detail = 'Antibody characterization {} '.format(value['@id']) + \
                     'has a mismatch between biosample term_id ({}) '.format(
                         term_id) + \
                     'and term_name ({}), ontology term_name for term_id {} '.format(
                         term_name,
                         term_id) + \
                     'is {}.'.format(ontology_term_name)
            yield AuditFailure('inconsistent ontology term',
                               detail,
                               level='ERROR')
            return
示例#18
0
def audit_biosample_part_of_consistency(value, system):
    '''
    Check that a biosample's term is consistent with the biosample it was
    separated from (part_of).

    No failure is produced when there is no part_of, when both biosamples
    share the same biosample_term_id, when the parent term is
    UBERON:0000468, or when both terms are in the ontology and is_part_of()
    returns True for them. Otherwise one INTERNAL_ACTION failure is yielded.
    '''
    if 'part_of' not in value:
        return

    parent = value['part_of']
    term_id = value['biosample_term_id']
    parent_term_id = parent['biosample_term_id']

    # Fall back to the term id when no human-readable name is present.
    term_name = value.get('biosample_term_name', term_id)
    parent_term_name = parent.get('biosample_term_name', parent_term_id)

    if part_of_is_trivial(term_id, parent_term_id) if False else (
            term_id == parent_term_id or parent_term_id == 'UBERON:0000468'):
        return

    ontology = system['registry']['ontology']
    both_known = term_id in ontology and parent_term_id in ontology
    if both_known and is_part_of(term_id, parent_term_id, ontology) is True:
        return

    detail = ('Biosample {} with biosample term {} '
              'was separated from biosample {} with biosample term {}. '
              'The {} ontology does not note that part_of relationship.')
    detail = detail.format(value['@id'], term_name, parent['@id'],
                           parent_term_name, term_id)
    yield AuditFailure('inconsistent biosample_term_id',
                       detail,
                       level='INTERNAL_ACTION')
def audit_antibody_characterization_unique_reviews(value, system):
    '''
    Make sure primary characterizations have unique lane, biosample_term_id and
    organism combinations for characterization reviews

    Characterizations that are deleted, not submitted, in progress or not
    reviewed are skipped, as are those with a
    secondary_characterization_method. Raises an INTERNAL_ACTION AuditFailure
    on the first review that repeats an earlier combination.
    '''
    if (value['status'] in [
            'deleted', 'not submitted for review by lab', 'in progress',
            'not reviewed'
    ]):
        return

    if 'secondary_characterization_method' in value:
        return

    seen_reviews = set()
    for review in value['characterization_reviews']:
        lane = review['lane']
        term_id = review['biosample_term_id']
        organism = review['organism']
        # A tuple keeps each field in its own position; an unordered set
        # would treat (lane, A, B) and (lane, B, A) as the same review.
        review_key = (lane, term_id, organism)
        if review_key in seen_reviews:
            detail = 'Lane {} in {} is a duplicate review for {} - {}'.format(
                lane, value['@id'], term_id, organism)
            raise AuditFailure('duplicate lane review',
                               detail,
                               level='INTERNAL_ACTION')
        seen_reviews.add(review_key)
示例#20
0
def audit_file_replicate_match(value, system):
    '''
    Verify that the replicate attached to a file belongs to the same
    experiment (dataset) as the file itself.

    Deleted, replaced and revoked files, and files without a replicate, are
    not checked. Yields one ERROR when the replicate's experiment uuid
    differs from the file's dataset uuid.
    '''
    if value['status'] in ('deleted', 'replaced', 'revoked') \
            or 'replicate' not in value:
        return

    replicate = value['replicate']
    replicate_experiment = replicate['experiment']
    replicate_experiment_uuid = replicate_experiment['uuid']
    dataset_uuid = value['dataset']['uuid']

    if replicate_experiment_uuid == dataset_uuid:
        return

    template = ('File {} from experiment {} '
                'is associated with replicate [{},{}] '
                '{}, but that replicate is associated with a different '
                'experiment {}.')
    detail = template.format(
        value['@id'],
        value['dataset']['@id'],
        replicate['biological_replicate_number'],
        replicate['technical_replicate_number'],
        replicate['@id'],
        replicate_experiment['@id'])
    yield AuditFailure('inconsistent replicate', detail, level='ERROR')
示例#21
0
def audit_status_replicate(value, system):
    '''
    Compare a replicate's status with the status of its experiment and raise
    an INTERNAL_ACTION AuditFailure when they clash. A clash is any of:
    * an in progress replicate whose experiment is released, revoked,
      proposed or preliminary
    * a released or revoked replicate whose experiment is neither released
      nor revoked
    * a deleted experiment whose replicate is not deleted
    '''
    replicate_status = value['status']
    experiment_status = value['experiment']['status']
    public_statuses = ('released', 'revoked')

    in_progress_clash = (
        replicate_status == 'in progress'
        and experiment_status in public_statuses + ('proposed', 'preliminary'))
    public_clash = (
        replicate_status in public_statuses
        and experiment_status not in public_statuses)
    deleted_clash = (
        experiment_status == 'deleted' and replicate_status != 'deleted')

    if not (in_progress_clash or public_clash or deleted_clash):
        return

    detail = '{} replicate {} is in {} experiment'.format(
        replicate_status, value['@id'], experiment_status)
    raise AuditFailure('mismatched status', detail, level='INTERNAL_ACTION')
示例#22
0
def audit_biosample_part_of_consistency(value, system):
    '''
    Check that a biosample's biosample_ontology term is consistent with the
    biosample it was separated from (part_of).

    No failure is produced when there is no part_of, when both biosamples
    share the same term_id, when the parent term is UBERON:0000468, or when
    both terms are in the ontology and is_part_of() returns True for them.
    Otherwise one INTERNAL_ACTION failure is yielded.
    '''
    if 'part_of' not in value:
        return

    parent = value['part_of']
    own_type = value['biosample_ontology']
    term_id, term_name = own_type['term_id'], own_type['term_name']
    parent_type = parent['biosample_ontology']
    parent_term_id = parent_type['term_id']
    parent_term_name = parent_type['term_name']

    if parent_term_id in (term_id, 'UBERON:0000468'):
        return

    ontology = system['registry']['ontology']
    both_known = term_id in ontology and parent_term_id in ontology
    if both_known and is_part_of(term_id, parent_term_id, ontology) is True:
        return

    biosample_link = audit_link(path_to_text(value['@id']), value['@id'])
    parent_link = audit_link(path_to_text(parent['@id']), parent['@id'])
    detail = ('Biosample {} with biosample term {} '
              'was separated from biosample {} '
              'with biosample term {}. The {} '
              'ontology does not note that part_of relationship.'.format(
                  biosample_link, term_name, parent_link, parent_term_name,
                  term_id))
    yield AuditFailure('inconsistent BiosampleType term',
                       detail,
                       level='INTERNAL_ACTION')
示例#23
0
def audit_duplicate_quality_metrics(value, system):
    '''
    Flag files that carry more than one quality metric with the same
    signature, where a signature is the first entry of the metric's @type
    together with its processing_stage.

    Yields one INTERNAL_ACTION failure per duplicated signature, no matter
    how many times that signature repeats.
    '''
    metrics = value.get('quality_metrics')
    if not metrics:
        return

    seen = []
    reported = []
    for metric in metrics:
        metric_type = metric.get('@type', [None])[0]
        signature = (metric_type, metric.get('processing_stage'))
        if signature not in seen:
            seen.append(signature)
            continue
        if signature in reported:
            # This signature has already produced its audit.
            continue
        reported.append(signature)
        file_link = audit_link(path_to_text(value.get('@id')),
                               value.get('@id'))
        detail = 'File {} has more than one {} quality metric.'.format(
            file_link, metric_type)
        yield AuditFailure('duplicate quality metric',
                           detail,
                           level='INTERNAL_ACTION')
示例#24
0
def audit_treatment_concentration_series_mixed_units(value, system):
    '''
    Flag a series whose related datasets use more than one treatment
    amount unit.

    The series itself and its related datasets are skipped when deleted,
    replaced or revoked; deleted replicates are skipped too. Units are
    collected from the treatments of each replicate's library biosample.
    Yields one INTERNAL_ACTION failure when more than one distinct unit
    is found.
    '''
    inactive_statuses = ['deleted', 'replaced', 'revoked']
    if value['status'] in inactive_statuses:
        return

    if 'related_datasets' not in value:
        return

    treatment_amount_units = set()
    for assay in value['related_datasets']:
        if assay['status'] in inactive_statuses or 'replicates' not in assay:
            continue
        for rep in assay['replicates']:
            if rep['status'] in ['deleted'] or 'library' not in rep \
                    or 'biosample' not in rep['library']:
                continue
            biosample_object = rep['library']['biosample']
            if 'treatments' not in biosample_object:
                continue
            for treatment in biosample_object['treatments']:
                # Skip treatments that carry no amount_units rather than
                # failing the whole audit with a KeyError.
                if 'amount_units' in treatment:
                    treatment_amount_units.add(treatment['amount_units'])

    if len(treatment_amount_units) > 1:
        # Render the units in sorted order so the message is identical from
        # run to run; a raw set repr depends on hash ordering.
        units_text = '{' + ', '.join(
            sorted(repr(unit) for unit in treatment_amount_units)) + '}'
        detail = (f"Treatments associated with series {audit_link(path_to_text(value['@id']), value['@id'])} "
            f"use inconsistent amount units {units_text}."
        )
        yield AuditFailure('inconsistent treatment units',
                           detail, level='INTERNAL_ACTION')
示例#25
0
def audit_file_processed_derived_from(value):
    '''
    Check that a processed file declares the files it was derived from.

    Raw data and reference files are skipped. Any other file with a missing
    or empty derived_from yields 'missing derived_from'. For bam files the
    derived_from entries that are bam, fastq, or raw-data fasta reads are
    examined further: each one from a different dataset yields
    'inconsistent derived_from', and if none of them counts as a usable
    source (status not deleted/replaced/revoked, or equal to the file's own
    status) a 'missing derived_from' is yielded.
    '''
    if value['output_category'] in ['raw data', 'reference']:
        return

    explanation = 'derived_from is a list of files that were used to create a given file; ' + \
                  'for example, fastq file(s) will appear in the derived_from list of an alignments file. '

    derived_from_files = value.get('derived_from')
    if not derived_from_files:
        detail = explanation + \
            'Processed file {} '.format(value['@id']) + \
            'is missing the requisite file specification in its derived_from list.'
        yield AuditFailure('missing derived_from',
                           detail,
                           level='INTERNAL_ACTION')
        return

    if value['file_format'] != 'bam':
        return

    fastq_bam_counter = 0
    for f in derived_from_files:
        is_read_source = (
            f['file_format'] == 'bam' or f['file_format'] == 'fastq'
            or (f['file_format'] == 'fasta' and f['output_type'] == 'reads'
                and f['output_category'] == 'raw data'))
        if not is_read_source:
            continue

        if f['status'] not in ['deleted', 'replaced', 'revoked'] or \
           f['status'] == value['status']:
            fastq_bam_counter += 1

        dataset_id = value['dataset'].get('@id')
        if f['dataset'] != dataset_id:
            # Name the experiment by its @id; formatting the embedded
            # dataset object would dump the whole dict into the message.
            detail = explanation + \
                'Alignments file {} '.format(value['@id']) + \
                'from experiment {} '.format(dataset_id) + \
                'specifies a file {} '.format(f['@id']) + \
                'from a different experiment {} '.format(f['dataset']) + \
                'in its derived_from list.'
            yield AuditFailure('inconsistent derived_from',
                               detail,
                               level='INTERNAL_ACTION')

    if fastq_bam_counter == 0:
        detail = explanation + \
            'Alignments file {} '.format(value['@id']) + \
            'is missing the requisite file specification in its derived_from list.'
        yield AuditFailure('missing derived_from',
                           detail,
                           level='INTERNAL_ACTION')
示例#26
0
def audit_file_size(value, system):
    '''
    Raise an INTERNAL_ACTION AuditFailure for a file that has no file_size,
    unless the file is deleted, replaced, uploading or revoked.
    '''
    exempt_statuses = ('deleted', 'replaced', 'uploading', 'revoked')
    if value['status'] in exempt_statuses or 'file_size' in value:
        return

    detail = 'File {} requires a value for file_size'.format(value['@id'])
    raise AuditFailure('missing file_size', detail, level='INTERNAL_ACTION')
示例#27
0
def audit_biosample_transfection_type(value, system):
    '''
    A biosample with constructs or rnais should have a
    transfection_type

    Deleted biosamples are skipped. Raises an ERROR AuditFailure when the
    biosample has a non-empty rnais or constructs property but no
    transfection_type; rnais is checked first.
    '''
    if value['status'] == 'deleted':
        return

    if 'transfection_type' in value:
        return

    # .get() so a biosample lacking the rnais/constructs property is treated
    # like one with an empty list instead of raising KeyError.
    if value.get('rnais'):
        detail = 'Biosample {} with a value for RNAi requires transfection_type'.format(
            value['@id'])
        raise AuditFailure('missing transfection_type', detail, level='ERROR')

    if value.get('constructs'):
        detail = 'Biosample {} with a value for construct requires transfection_type'.format(
            value['@id'])
        raise AuditFailure('missing transfection_type', detail, level='ERROR')
def audit_antibody_characterization_status(value, system):
    '''
    Make sure the lane_status matches
    the characterization status

    Characterizations that are not reviewed, not submitted, deleted or in
    progress are skipped, as are those with a
    secondary_characterization_method. Raises an INTERNAL_ACTION
    AuditFailure when a lane is 'pending dcc review' while the
    characterization is not (or the reverse), or when some lane is
    'compliant' but the characterization status is not.
    '''

    if (value['status'] in [
            'not reviewed', 'not submitted for review by lab', 'deleted',
            'in progress'
    ]):
        return

    if 'secondary_characterization_method' in value:
        return

    # Check each of the lane_statuses in characterization_reviews for an
    # appropriate match.
    pending = 'pending dcc review'
    is_pending = value['status'] == pending
    has_compliant_lane = False
    for lane in value['characterization_reviews']:
        lane_status = lane['lane_status']
        # Pending must be all-or-nothing between the lane and the
        # characterization.
        if is_pending != (lane_status == pending):
            detail = (
                'A lane.status of {} in antibody characterization {} is incompatible with antibody_characterization.status of {}.'
                .format(lane_status,
                        audit_link(path_to_text(value['@id']), value['@id']),
                        value['status']))
            raise AuditFailure('mismatched lane status',
                               detail,
                               level='INTERNAL_ACTION')

        if lane_status == 'compliant':
            has_compliant_lane = True

    if has_compliant_lane and value['status'] != 'compliant':
        # Name the status that triggered the audit; the loop variable would
        # only reflect whichever lane happened to come last.
        detail = (
            'A lane.status of {} in antibody characterization {} is incompatible with antibody_characterization status of {}.'
            .format('compliant',
                    audit_link(path_to_text(value['@id']), value['@id']),
                    value['status']))
        raise AuditFailure('mismatched lane status',
                           detail,
                           level='INTERNAL_ACTION')
示例#29
0
def audit_file_format_specifications(value, system):
    '''
    Raise an ERROR AuditFailure for the first document listed in
    file_format_specifications whose document_type is not
    "file format specification". Files without that property pass.
    '''
    expected_type = "file format specification"
    for spec_doc in value.get('file_format_specifications', []):
        if spec_doc['document_type'] == expected_type:
            continue
        detail = 'File {} has document {} not of type file format specification'.format(
            value['@id'], spec_doc['@id'])
        raise AuditFailure('inconsistent document_type',
                           detail,
                           level='ERROR')
示例#30
0
def audit_file_md5sum_integrity(value, system):
    '''
    Check that a file's md5sum parses as a base-16 integer and is 32
    characters long.

    Deleted, replaced and revoked files are skipped. Yields a single
    INTERNAL_ACTION failure describing whichever check failed; the
    hexadecimal check takes precedence over the length check.
    '''
    if value['status'] in ('deleted', 'replaced', 'revoked'):
        return

    md5sum = value['md5sum']
    try:
        # Parsed only for validation; the numeric value is not needed.
        int(md5sum, 16)
        problem = None if len(md5sum) == 32 \
            else 'which is not 32 characters long.'
    except ValueError:
        problem = 'which is not a valid hexadecimal number.'

    if problem is None:
        return

    detail = 'File {} has an md5sum value of {}, '.format(
        value['@id'], md5sum) + problem
    yield AuditFailure('inconsistent md5sum',
                       detail, level='INTERNAL_ACTION')