def audit_file_processed_derived_from(value, system):
    '''
    Audit the derived_from list of a processed file.

    Files whose output_category is "raw data" or "reference" are skipped.
    Any other file must have a non-empty derived_from list. For BAM files
    that are not replaced, the list must additionally contain at least one
    BAM, FASTQ or raw-reads FASTA/CSFASTA/CSQUAL file whose STATUS_LEVEL is
    not below the BAM's own, and each such file is compared against the
    BAM's dataset. The ``system`` argument is not used.
    '''
    preamble = (
        'derived_from is a list of files that were used to create a given file; '
        'for example, fastq file(s) will appear in the derived_from list of an '
        'alignments file. '
    )

    if value['output_category'] in ('raw data', 'reference'):
        return

    if 'derived_from' not in value or len(value['derived_from']) == 0:
        detail = preamble + (
            'Processed file {} '
            'is missing the requisite file specification in its derived_from list.'
        ).format(value['@id'])
        yield AuditFailure('missing derived_from', detail, level='INTERNAL_ACTION')
        return

    if value['file_format'] != 'bam':
        return

    # Ignore replaced BAMs because missing derived_from logic should be applied to their
    # replacements instead (ENCD-3595).
    if value['status'] == 'replaced':
        return

    usable_sources = 0
    for source in value.get('derived_from'):
        source_format = source['file_format']
        is_read_source = (
            source_format in ('bam', 'fastq')
            or (
                source_format in ('fasta', 'csfasta', 'csqual')
                and source['output_type'] == 'reads'
                and source['output_category'] == 'raw data'
            )
        )
        if not is_read_source:
            continue

        # Audit shouldn't trigger if status isn't registered in STATUS_LEVEL dict.
        if source['status'] not in STATUS_LEVEL or value['status'] not in STATUS_LEVEL:
            return

        if STATUS_LEVEL[source['status']] >= STATUS_LEVEL[value['status']]:
            usable_sources += 1

        if source['dataset'] != value['dataset'].get('@id'):
            detail = preamble + (
                'Alignments file {} '
                'from experiment {} '
                'specifies a file {} '
                'from a different experiment {} '
                'in its derived_from list.'
            ).format(
                value['@id'],
                value['dataset']['@id'],
                source['@id'],
                source['dataset'],
            )
            yield AuditFailure('inconsistent derived_from', detail, level='INTERNAL_ACTION')

    if usable_sources == 0:
        detail = preamble + (
            'Alignments file {} '
            'is missing the requisite file specification in its derived_from list.'
        ).format(value['@id'])
        yield AuditFailure('missing derived_from', detail, level='INTERNAL_ACTION')
def audit_item_schema(value, system):
    '''
    Validate an item's properties against its schema.

    A copy of ``context.properties`` is upgraded to the schema version of
    ``context.type_info`` when the versions differ, then passed to
    ``validate``. An upgrade that fails with anything other than
    RuntimeError is reported as an "upgrade failure"; RuntimeError is
    re-raised. Each validation error is reported as a "validation error",
    suffixed with the error path when there is one.
    '''
    context = system['context']
    registry = system['registry']
    if not context.schema:
        return

    properties = context.properties.copy()
    stored_version = properties.get('schema_version', '')
    wanted_version = context.type_info.schema_version

    if wanted_version is not None and stored_version != wanted_version:
        # Looked up outside the try block so a lookup failure is not
        # reported as an upgrade failure.
        upgrader = registry[UPGRADER]
        try:
            properties = upgrader.upgrade(
                context.type_info.name,
                properties,
                stored_version,
                wanted_version,
                finalize=False,
                context=context,
                registry=registry,
            )
        except RuntimeError:
            raise
        except Exception as e:
            detail = '{!r} upgrading from {!r} to {!r}'.format(
                e, stored_version, wanted_version)
            yield AuditFailure('upgrade failure', detail, level='INTERNAL_ACTION')
            return
        else:
            properties['schema_version'] = wanted_version

    properties['uuid'] = str(context.uuid)
    _, errors = validate(context.schema, properties, properties)
    for error in errors:
        segments = [str(elem) for elem in error.path]
        category = 'validation error'
        if segments:
            category = category + ': ' + '/'.join(segments)
        detail = 'Object {} has schema error {}'.format(value['@id'], error.message)
        yield AuditFailure(category, detail, level='INTERNAL_ACTION')
def audit_antibody_missing_characterizations(value, system):
    '''
    Check to see what characterizations are lacking for each antibody,
    for the cell lines we know about.

    Nothing is reported when any target is investigated as a control.
    Otherwise failures (all NOT_COMPLIANT) are yielded when there are no
    characterizations at all, no primary or no secondary characterizations,
    a lot review whose detail says a compliant primary is still needed, or
    secondary characterizations none of which is compliant/exempt.
    The ``system`` argument is not used.
    '''
    # Lot-review details that indicate a compliant primary is still needed.
    needs_primary_details = (
        'Awaiting a compliant primary and pending review of a secondary characterization.',
        'Awaiting a compliant primary and secondary characterization was not reviewed.',
        'Awaiting a compliant primary and submission of a secondary characterization.',
        'Awaiting a compliant primary characterization.',
        'Awaiting compliant primary and secondary characterizations.',
        'Primary characterization not reviewed and awaiting a compliant secondary characterization.',
        'Primary characterization not reviewed and pending review of a secondary characterization.',
        'Primary characterization not reviewed.',
        'Pending review of primary and secondary characterizations.',
        'Pending review of primary characterization and awaiting a compliant secondary characterization.',
        'Pending review of primary characterization and secondary characterization not reviewed.',
        'Pending review of primary characterization.',
    )

    for target in value['targets']:
        # A target without investigated_as is treated as a non-control
        # rather than raising TypeError on "'control' in None".
        if 'control' in (target.get('investigated_as') or []):
            return

    if not value['characterizations']:
        detail = '{} does not have any supporting characterizations submitted.'.format(value['@id'])
        yield AuditFailure('no characterizations submitted', detail, level='NOT_COMPLIANT')
        return

    primary_chars = []
    secondary_chars = []
    compliant_secondary = False
    for char in value['characterizations']:
        if 'primary_characterization_method' in char:
            primary_chars.append(char)
        if 'secondary_characterization_method' in char:
            secondary_chars.append(char)
            if char['status'] in ['compliant', 'exempt from standards']:
                compliant_secondary = True

    if not primary_chars:
        detail = '{} does not have any primary characterizations submitted.'.format(value['@id'])
        yield AuditFailure('no primary characterizations', detail, level='NOT_COMPLIANT')

    if not secondary_chars:
        detail = '{} does not have any secondary characterizations submitted.'.format(value['@id'])
        yield AuditFailure('no secondary characterizations', detail, level='NOT_COMPLIANT')

    for lot_review in value['lot_reviews']:
        if lot_review['detail'] not in needs_primary_details:
            continue
        biosample = lot_review['biosample_term_name']
        if biosample == 'any cell type or tissue':
            biosample = 'one or more cell types/tissues.'
        detail = '{} needs a compliant primary in {}'.format(value['@id'], biosample)
        yield AuditFailure('need compliant primaries', detail, level='NOT_COMPLIANT')

    if secondary_chars and not compliant_secondary:
        detail = '{} needs a compliant secondary characterization.'.format(value['@id'])
        yield AuditFailure('need compliant secondary', detail, level='NOT_COMPLIANT')
def audit_reference_epigenome_donor_biosample(value, system):
    '''
    Warn when the datasets of a reference epigenome disagree on biosample
    treatments, biosample term names or donors.

    Deleted/replaced/revoked reference epigenomes and related datasets are
    ignored, as are deleted replicates and replicates without a library
    biosample. A biosample with no (or empty) treatments counts as
    "untreated". The ``system`` argument is not used.
    '''
    inactive = ['deleted', 'replaced', 'revoked']
    if value['status'] in inactive:
        return
    if 'related_datasets' not in value:
        return

    treatments_seen = set()
    term_names_seen = set()
    donors_seen = set()

    for dataset in value['related_datasets']:
        if dataset['status'] in inactive or 'replicates' not in dataset:
            continue
        for replicate in dataset['replicates']:
            if replicate['status'] in ['deleted']:
                continue
            if 'library' not in replicate or 'biosample' not in replicate['library']:
                continue
            biosample = replicate['library']['biosample']

            if 'biosample_term_name' in biosample:
                term_names_seen.add(biosample['biosample_term_name'])
            if 'donor' in biosample:
                donors_seen.add(biosample['donor']['accession'])

            if 'treatments' not in biosample or len(biosample['treatments']) == 0:
                treatments_seen.add('untreated')
            else:
                # Sorted so the same treatments in a different order compare equal.
                labels = sorted(
                    'treated with ' + treatment['treatment_term_name']
                    for treatment in biosample['treatments']
                )
                treatments_seen.add(', '.join(labels))

    if len(treatments_seen) > 1:
        detail = (
            'Reference Epigenome {} '
            ' has biosample associated with different tretments {}.'
        ).format(value['@id'], treatments_seen)
        yield AuditFailure(
            'multiple biosample treatments in reference epigenome',
            detail, level='WARNING')

    if len(term_names_seen) > 1:
        detail = (
            'Reference Epigenome {} '
            ' has multiple biosample term names {}.'
        ).format(value['@id'], term_names_seen)
        yield AuditFailure(
            'multiple biosample term names in reference epigenome',
            detail, level='WARNING')

    if len(donors_seen) > 1:
        detail = (
            'Reference Epigenome {} '
            ' has multiple donors {}.'
        ).format(value['@id'], donors_seen)
        yield AuditFailure('multiple donors in reference epigenome', detail, level='WARNING')
def audit_biosample_term(value, system):
    '''
    Biosample_term_id and biosample_term_name and biosample_type should all be present.
    This should be handled by schemas.
    Biosample_term_id should be in the ontology.
    Biosample_term_name should match biosample_term_id.

    Deleted biosamples and biosamples without a biosample_term_id are
    skipped. At most one failure is yielded: the checks run in the order
    NTR term, ontology prefix vs. biosample_type, term_id in ontology,
    term_name vs. ontology name/synonyms, and stop at the first hit.
    '''
    if value['status'] in ['deleted']:
        return
    if 'biosample_term_id' not in value:
        return

    ontology = system['registry']['ontology']
    term_id = value['biosample_term_id']
    term_name = value.get('biosample_term_name')

    if term_id.startswith('NTR:'):
        detail = 'Biosample {} has a New Term Request {} - {}'.format(
            value['@id'], term_id, term_name)
        yield AuditFailure('NTR biosample', detail, level='INTERNAL_ACTION')
        return

    allowed_prefixes = biosampleType_ontologyPrefix[value['biosample_type']]
    if term_id.split(':')[0] not in allowed_prefixes:
        detail = (
            'Biosample {} of '
            'type {} '
            'has biosample_term_id {} '
            'that is not one of '
            '{}'
        ).format(
            value['@id'],
            value['biosample_type'],
            value['biosample_term_id'],
            allowed_prefixes,
        )
        yield AuditFailure('biosample term-type mismatch', detail, level='INTERNAL_ACTION')
        return

    if term_id not in ontology:
        detail = 'Biosample {} has biosample_term_id of {} which is not in ontology'.format(
            value['@id'], term_id)
        # Report the full detail message; previously the bare term_id was
        # passed and the message built above was discarded.
        yield AuditFailure('term_id not in ontology', detail, level='INTERNAL_ACTION')
        return

    ontology_term_name = ontology[term_id]['name']
    if ontology_term_name != term_name and term_name not in ontology[term_id]['synonyms']:
        detail = (
            'Biosample {} has '
            'a mismatch between biosample term_id ({}) '
            'and term_name ({}), ontology term_name for term_id {} '
            'is {}.'
        ).format(value['@id'], term_id, term_name, term_id, ontology_term_name)
        yield AuditFailure('inconsistent ontology term', detail, level='ERROR')
def audit_antibody_characterization_target(value, system):
    '''
    Make sure that target in characterization matches target of antibody

    For a target investigated as a recombinant protein, the antibody must
    have a target investigated as a tag or synthetic tag, and the part of
    the target label before the first "-" must be one of the antibody's
    target labels. For any other target, its name must be among the
    antibody's target names. Raises AuditFailure (ERROR) on the first
    violation; the ``system`` argument is not used.
    '''
    antibody = value['characterizes']
    target = value['target']

    if 'recombinant protein' not in target['investigated_as']:
        antibody_target_names = [
            antibody_target.get('name') for antibody_target in antibody['targets']
        ]
        if target['name'] in antibody_target_names:
            return
        antibody_targets_string = str(antibody_target_names).replace("'", '')
        detail = (
            "Antibody characterization {} target is {}, but it could not be found in antibody's {} target list {}."
            .format(
                audit_link(path_to_text(value['@id']), value['@id']),
                target['name'],
                audit_link(path_to_text(antibody['@id']), antibody['@id']),
                antibody_targets_string))
        raise AuditFailure('inconsistent target', detail, level='ERROR')

    tag_prefix = target['label'].split('-')[0]
    antibody_labels = set()
    antibody_roles = set()
    for antibody_target in antibody['targets']:
        antibody_labels.add(antibody_target['label'])
        antibody_roles.update(antibody_target['investigated_as'])

    if antibody_roles.isdisjoint(('tag', 'synthetic tag')):
        detail = (
            'Antibody {} is not for a tagged protein, yet target {} in antibody characterization {} is investigated_as a recombinant protein.'
            .format(
                audit_link(path_to_text(antibody['@id']), antibody['@id']),
                tag_prefix,
                audit_link(path_to_text(value['@id']), value['@id']),
            ))
        raise AuditFailure('not tagged antibody', detail, level='ERROR')

    if tag_prefix not in antibody_labels:
        detail = (
            '{} is not found in target list in antibody characterization {} for antibody {}'
            .format(
                tag_prefix.capitalize(),
                audit_link(path_to_text(value['@id']), value['@id']),
                audit_link(path_to_text(antibody['@id']), antibody['@id'])))
        raise AuditFailure('mismatched tag target', detail, level='ERROR')
def audit_paired_with(value, system):
    '''
    A file with a paired_end needs a paired_with.
    Should be handled in the schema.
    A paired_with should be the same replicate

    Files without paired_end, with paired_end "1,2", without paired_with,
    or whose paired_with file has no replicate are skipped. Otherwise the
    replicate of both files is compared, a paired end 1 file is checked
    for being referenced by more than one paired_with reverse link, and
    finally the read counts of the two files are compared when both are set.
    '''
    if 'paired_end' not in value or value['paired_end'] in ['1,2']:
        return
    if 'paired_with' not in value or 'replicate' not in value['paired_with']:
        return

    mate = value['paired_with']

    if 'replicate' not in value:
        detail = 'File {} has paired_end = {}. It requires a replicate'.format(
            value['@id'], value['paired_end'])
        yield AuditFailure('missing replicate', detail, level='INTERNAL_ACTION')
    elif value['replicate'].get('@id') != mate['replicate']:
        detail = 'File {} has replicate {}. It is paired_with file {} with replicate {}'.format(
            value['@id'],
            value['replicate'].get('@id'),
            mate['@id'],
            mate.get('replicate'))
        yield AuditFailure('inconsistent paired_with', detail, level='ERROR')

    if value['paired_end'] == '1':
        referencing_files = system['context'].get_rev_links('paired_with')
        if len(referencing_files) > 1:
            detail = 'Paired end 1 file {} paired_with by multiple paired end 2 files: {!r}'.format(
                value['@id'], referencing_files)
            yield AuditFailure('multiple paired_with', detail, level='ERROR')
            # The read-count comparison is skipped once multiple mates are found.
            return

    own_reads = value.get('read_count')
    mate_reads = mate.get('read_count')
    if own_reads and mate_reads and own_reads != mate_reads:
        detail = (
            'File {} has {} reads. It is'
            ' paired_with file {} that has {} reads'
        ).format(value['@id'], own_reads, mate['@id'], mate_reads)
        yield AuditFailure('inconsistent read count', detail, level='ERROR')
def audit_item_status(value, system):
    '''
    Compare an item's status level with the status level of the objects
    it links to.

    Levels come from STATUS_LEVEL (50 when the status is not listed), with
    50 added for "revoked" and "archived". Items without a status or with
    level 0 are skipped. Links found under supersedes, step_run,
    derived_from, controlled_by and possible_controls are not followed.
    A "mismatched status" failure is yielded for every linked object whose
    level is 0 or lower than the item's level.
    '''
    if 'status' not in value:
        return

    own_level = STATUS_LEVEL.get(value['status'], 50)
    if own_level == 0:
        return
    if value['status'] in ['revoked', 'archived']:
        own_level += 50

    context = system['context']
    request = system['request']

    ignored_links = [
        'supersedes', 'step_run', 'derived_from', 'controlled_by',
        'possible_controls'
    ]
    linked_paths = set()
    for schema_path in context.type_info.schema_links:
        if schema_path not in ignored_links:
            linked_paths.update(simple_path_ids(value, schema_path))

    for linked_path in linked_paths:
        linked_value = request.embed(linked_path + '@@object')
        if 'status' not in linked_value:
            continue
        linked_status = linked_value['status']
        if linked_status == 'disabled':
            continue
        # Special case: A revoked file can have a deleted replicate ticket #2938
        if ('File' in value['@type'] and
                value['status'] == 'revoked' and
                'Replicate' in linked_value['@type'] and
                linked_status == 'deleted'):
            continue

        linked_level = STATUS_LEVEL.get(linked_status, 50)
        if linked_status in ['revoked', 'archived']:
            linked_level += 50

        if linked_level == 0 or linked_level < own_level:
            detail = '{} {} has {} subobject {}'.format(
                value['status'], value['@id'], linked_status, linked_value['@id'])
            yield AuditFailure('mismatched status', detail, level='INTERNAL_ACTION')
def audit_biosample_modifications(value, system):
    '''
    Compare a whole-organism biosample's genetic_modifications with its
    model_organism_donor_modifications.

    Only biosamples classified as "whole organisms" are checked. A
    "mismatched genetic modifications" failure is yielded when the
    biosample lists modifications absent from a present donor list, and a
    "duplicated genetic modifications" failure when the two lists overlap.
    The ``system`` argument is not used.
    '''
    if value['biosample_ontology']['classification'] != 'whole organisms':
        return

    strain_mods_present = 'model_organism_donor_modifications' in value
    strain_mod_ids = set()
    if strain_mods_present:
        strain_mod_ids.update(value['model_organism_donor_modifications'])

    biosample_mod_ids = set()
    if 'genetic_modifications' in value:
        biosample_mod_ids.update(value['genetic_modifications'])

    only_on_biosample = biosample_mod_ids - strain_mod_ids
    if only_on_biosample and strain_mods_present:
        extra_links = [audit_link(path_to_text(m), m) for m in only_on_biosample]
        strain_links = [audit_link(path_to_text(n), n) for n in strain_mod_ids]
        detail = (
            'Biosample {} contains '
            ' genetic modificatons {} that are not present'
            ' in the list of genetic modifications {} of the corresponding strain.'
            .format(
                audit_link(path_to_text(value['@id']), value['@id']),
                ', '.join(extra_links),
                ', '.join(strain_links)))
        yield AuditFailure('mismatched genetic modifications', detail,
                           level='INTERNAL_ACTION')

    shared_mods = strain_mod_ids & biosample_mod_ids
    shared_links = [audit_link(path_to_text(d), d) for d in shared_mods]
    strain_links = [audit_link(path_to_text(n), n) for n in strain_mod_ids]
    if shared_mods:
        detail = (
            'Biosample {} contains '
            'genetic modifications {} that '
            'are duplicates of genetic modifications {} '
            'of the corresponding strain.'.format(
                audit_link(path_to_text(value['@id']), value['@id']),
                ', '.join(shared_links),
                ', '.join(strain_links)))
        yield AuditFailure('duplicated genetic modifications', detail,
                           level='INTERNAL_ACTION')
def audit_term(value, system):
    '''
    The classification, term_id and term_name should all be present.
    This should be handled by schemas.
    The term_id should be in the ontology.
    The term_name should match the term_id.

    Deleted items are skipped. At most one failure is yielded: an NTR
    term, a term_id absent from the ontology, or a term_name that is
    neither the ontology name nor one of its synonyms.
    '''
    if value['status'] in ['deleted']:
        return

    ontology = system['registry']['ontology']
    term_name = value['term_name']
    term_id = value['term_id']

    if term_id.startswith('NTR:'):
        detail = 'BiosampleType {} has a New Term Request {} - {}.'.format(
            audit_link(path_to_text(value['@id']), value['@id']),
            term_id,
            term_name)
        yield AuditFailure('NTR biosample', detail, level='INTERNAL_ACTION')
        return

    if term_id not in ontology:
        detail = (
            'BiosampleType {} specifies a term_id {} '
            'that is not part of the {} ontology.'
        ).format(
            audit_link(path_to_text(value['@id']), value['@id']),
            term_id,
            term_id.split(':', 1)[0])
        yield AuditFailure('term_id not in ontology', detail, level='INTERNAL_ACTION')
        return

    ontology_entry = ontology[term_id]
    ontology_term_name = ontology_entry['name']
    if ontology_term_name == term_name:
        return
    if term_name in ontology_entry['synonyms']:
        return

    detail = (
        'BiosampleType {object_id} has a mismatch between'
        ' term_id ({term_id}) and term_name ({term_name}),'
        ' ontology term_name for term_id {term_id} is'
        ' {ontology_term_name}.'
    ).format(
        object_id=audit_link(path_to_text(value['@id']), value['@id']),
        term_id=term_id,
        term_name=term_name,
        ontology_term_name=ontology_term_name)
    yield AuditFailure('inconsistent ontology term', detail, level='ERROR')
def audit_read_structure(value, system):
    '''
    Check the coordinates of every element in read_structure.

    Yields an "invalid read_structure" ERROR when an element's start or
    end is 0, and another when its start is greater than its end. A value
    without read_structure yields nothing. The ``system`` argument is not
    used.
    '''
    for segment in value.get('read_structure', []):
        if segment['start'] == 0 or segment['end'] == 0:
            detail = (
                'The read_stucture is 1-based. '
                'Neither start or end can be 0 for sequence element {}.'
            ).format(segment['sequence_element'])
            yield AuditFailure('invalid read_structure', detail, level='ERROR')

        if segment['start'] > segment['end']:
            detail = (
                'The start coordinate is bigger than the end coordinate '
                'for sequence element {}.'
            ).format(segment['sequence_element'])
            yield AuditFailure('invalid read_structure', detail, level='ERROR')
def audit_antibody_characterization_review(value, system):
    '''
    Make sure that biosample terms are in ontology
    for each characterization_review.

    Characterizations that are not reviewed, not submitted, deleted or in
    progress are skipped, as are secondary characterizations and
    characterizations with no characterization_reviews. For each review
    the checks run in the order NTR term, term_id in ontology, term_name
    vs. ontology name/synonyms, ontology prefix vs. biosample_type; the
    audit stops at the first failure.
    '''
    if value['status'] in ['not reviewed', 'not submitted for review by lab',
                           'deleted', 'in progress']:
        return
    if 'secondary_characterization_method' in value:
        return

    # A missing characterization_reviews key is treated like an empty list
    # instead of raising KeyError.
    reviews = value.get('characterization_reviews')
    if not reviews:
        return

    ontology = system['registry']['ontology']
    for review in reviews:
        term_id = review['biosample_term_id']
        term_name = review['biosample_term_name']
        term_type = review['biosample_type']

        if term_id.startswith('NTR:'):
            detail = '{} contains a New Term Request {} - {}'.format(
                value['@id'], term_id, term_name)
            yield AuditFailure('NTR biosample', detail, level='INTERNAL_ACTION')
            return

        if term_id not in ontology:
            detail = (
                'Antibody characterization {} contains '
                'a biosample_term_id {} that is not in the ontology'
            ).format(value['@id'], term_id)
            # Report the full detail message; previously the bare term_id
            # was passed and the message built above was discarded.
            yield AuditFailure('term_id not in ontology', detail, level='INTERNAL_ACTION')
            return

        ontology_term_name = ontology[term_id]['name']
        if ontology_term_name != term_name and term_name not in ontology[term_id]['synonyms']:
            detail = (
                'Antibody characterization {} '
                'has a mismatched term {} - {} expected {}'
            ).format(value['@id'], term_id, term_name, ontology_term_name)
            yield AuditFailure('inconsistent ontology term', detail, level='ERROR')
            return

        allowed_prefixes = biosampleType_ontologyPrefix[term_type]
        if term_id.split(':')[0] not in allowed_prefixes:
            detail = (
                'Antibody characterization {} is '
                'of type {} '
                'and has biosample_term_id {} '
                'that is not one of '
                '{}'
            ).format(value['@id'], term_type, term_id, allowed_prefixes)
            yield AuditFailure('characterization review with biosample term-type mismatch',
                               detail, level='INTERNAL_ACTION')
            return
def audit_paired_with(value, system):
    '''
    A file with a paired_end needs a paired_with.
    Should be handled in the schema.
    A paired_with should be the same replicate

    Deleted, replaced and revoked files and files without paired_end are
    skipped. Raises AuditFailure on the first problem found: missing
    paired_with, missing replicate, replicate differing from the
    paired_with file's replicate, or a paired end 1 file referenced by
    more than one paired_with reverse link.
    '''
    if value['status'] in ['deleted', 'replaced', 'revoked']:
        return
    if 'paired_end' not in value:
        return

    if 'paired_with' not in value:
        expected_mate = '1' if value['paired_end'] == '2' else '2'
        detail = (
            'Sequencing read{} file {} is the result of a '.format(
                value['paired_end'], value['@id'])
            + 'paired-end sequencing run according to the submitted metadata. '
            + 'An association with a read{} file needs to be specified.'.format(
                expected_mate)
        )
        raise AuditFailure('missing paired_with', detail, level='ERROR')

    mate = value['paired_with']
    if 'replicate' not in mate:
        return

    if 'replicate' not in value:
        detail = 'File {} has paired_end = {}. It requires a replicate'.format(
            value['@id'], value['paired_end'])
        raise AuditFailure('missing replicate', detail, level='INTERNAL_ACTION')

    if value['replicate'] != mate['replicate']:
        detail = 'File {} has replicate {}. It is paired_with file {} with replicate {}'.format(
            value['@id'],
            value.get('replicate'),
            mate['@id'],
            mate.get('replicate'))
        raise AuditFailure('inconsistent paired_with', detail, level='ERROR')

    if value['paired_end'] == '1':
        referencing_files = system['context'].get_rev_links('paired_with')
        if len(referencing_files) > 1:
            detail = 'Paired end 1 file {} paired_with by multiple paired end 2 files: {!r}'.format(
                value['@id'], referencing_files)
            raise AuditFailure('multiple paired_with', detail, level='ERROR')
def audit_biosample_donor(value, system):
    '''
    A biosample should have a donor.
    The organism of donor and biosample should match.

    Deleted biosamples are skipped. A missing donor is reported as
    INTERNAL_ACTION when the award rfa is "GGR" and as ERROR otherwise.
    When the donor has a mutated_gene, its organism must match the
    biosample's and it must not be investigated as a tag, control,
    recombinant protein or modification. The ``system`` argument is not
    used.
    '''
    if value['status'] in ['deleted']:
        return

    if 'donor' not in value:
        detail = 'Biosample {} is not associated with any donor.'.format(
            value['@id'])
        is_ggr = ('award' in value and 'rfa' in value['award'] and
                  value['award']['rfa'] == 'GGR')
        level = 'INTERNAL_ACTION' if is_ggr else 'ERROR'
        yield AuditFailure('missing donor', detail, level=level)
        return

    donor = value['donor']
    if value.get('organism') != donor.get('organism'):
        detail = 'Biosample {} is organism {}, yet its donor {} is organism {}. Biosamples require a donor of the same species'.format(
            value['@id'],
            value.get('organism'),
            donor['@id'],
            donor.get('organism'))
        yield AuditFailure('inconsistent organism', detail, level='ERROR')

    if 'mutated_gene' not in donor:
        return
    mutated_gene = donor['mutated_gene']

    if value.get('organism') != mutated_gene.get('organism'):
        detail = 'Biosample {} is organism {}, but its donor {} mutated_gene is in {}. Donor mutated_gene should be of the same species as the donor and biosample'.format(
            value['@id'],
            value.get('organism'),
            donor['@id'],
            mutated_gene.get('organism'))
        yield AuditFailure('inconsistent mutated_gene organism', detail, level='ERROR')

    invalid_roles = [
        'tag', 'control', 'recombinant protein', 'nucleotide modification',
        'other post-translational modification'
    ]
    # A mutated_gene without investigated_as has nothing to check; iterating
    # the bare .get() result would raise TypeError on None.
    for role in mutated_gene.get('investigated_as') or []:
        if role in invalid_roles:
            detail = 'Donor {} has an invalid mutated_gene {}. Donor mutated_genes should not be tags, controls, recombinant proteins or modifications'.format(
                donor['@id'],
                mutated_gene.get('name'))
            yield AuditFailure('invalid donor mutated_gene', detail, level='ERROR')
def audit_fly_worm_donor_genotype_dbxrefs(value, system):
    '''
    Fly and worm donors need their genotype information and dbxrefs filled
    out since the genotype will be part of the biosample summary.

    Only items whose @type contains FlyDonor or WormDonor are checked. A
    WARNING is yielded for a missing or empty genotype, and another for
    missing or empty dbxrefs. The ``system`` argument is not used.
    '''
    item_types = value['@type']
    if 'FlyDonor' not in item_types and 'WormDonor' not in item_types:
        return

    if not value.get('genotype'):
        detail = (
            'Strain {} should have a value '
            'specified for genotype.'
        ).format(value['@id'])
        yield AuditFailure('missing genotype', detail, level='WARNING')

    # .get() so that an absent dbxrefs key is reported like an empty one,
    # matching how genotype is handled, instead of raising KeyError.
    if not value.get('dbxrefs'):
        detail = (
            'Strain {} should have one or more ids '
            'specified in the dbxrefs array.'
        ).format(value['@id'])
        yield AuditFailure('missing dbxrefs', detail, level='WARNING')
def audit_file_matching_md5sum(value, system):
    '''
    Files with md5 sums matching other files should be marked with a WARNING audit.
    If the other files are listed as matching but in fact have different md5 sums,
    the file should be flagged with an ERROR for incorrect metadata.

    Only files that have matching_md5sum and a status of released,
    revoked, archived or in progress are checked; a listed file only
    counts towards the WARNING when its own status is one of those too.
    A file listing itself is flagged with an ERROR. The ``system``
    argument is not used.
    '''
    checked_statuses = ['released', 'revoked', 'archived', 'in progress']
    if 'matching_md5sum' not in value:
        return
    if value.get('status') not in checked_statuses:
        return

    confirmed_matches = []
    for listed in value.get('matching_md5sum'):
        if listed.get('uuid') == value.get('uuid'):
            detail = (
                'File {} is listing itself as having '
                'a matching md5 sum.'
            ).format(audit_link(path_to_text(value['@id']), value['@id']))
            yield AuditFailure('inconsistent matching_md5sum', detail, level='ERROR')

        if listed.get('md5sum') != value.get('md5sum'):
            detail = (
                'File {} is listed as having a matching md5 sum '
                'as file {}, but the files have different md5 sums.'
            ).format(
                audit_link(path_to_text(listed['@id']), listed['@id']),
                audit_link(path_to_text(value['@id']), value['@id']))
            yield AuditFailure('inconsistent matching_md5sum', detail, level='ERROR')
        elif listed.get('status') in checked_statuses:
            confirmed_matches.append(listed['@id'])

    if not confirmed_matches:
        return

    links = [audit_link(path_to_text(match), match) for match in confirmed_matches]
    if len(confirmed_matches) > 2:
        joined = 'Files {}, and {}'.format(', '.join(links[:-1]), links[-1])
    else:
        joined = ' and '.join(links)

    detail = (
        'The md5 sum of file {} '
        'matches that of file(s) {}.'
    ).format(audit_link(path_to_text(value['@id']), value['@id']), joined)
    yield AuditFailure('matching md5 sums', detail, level='WARNING')
def audit_antibody_characterization_review(value, system):
    '''
    Make sure that biosample terms are in ontology
    for each characterization_review.

    Characterizations that are not reviewed, not submitted, deleted or in
    progress are skipped, as are secondary characterizations and
    characterizations with no characterization_reviews. For each review
    the checks run in the order NTR term, term_id in ontology, term_name
    vs. ontology name/synonyms; the audit stops at the first failure.
    '''
    if value['status'] in ['not reviewed', 'not submitted for review by lab',
                           'deleted', 'in progress']:
        return
    if 'secondary_characterization_method' in value:
        return

    # A missing characterization_reviews key is treated like an empty list
    # instead of raising KeyError.
    reviews = value.get('characterization_reviews')
    if not reviews:
        return

    ontology = system['registry']['ontology']
    for review in reviews:
        term_id = review['biosample_term_id']
        term_name = review['biosample_term_name']

        if term_id.startswith('NTR:'):
            detail = '{} contains a New Term Request {} - {}'.format(
                value['@id'], term_id, term_name)
            yield AuditFailure('NTR biosample', detail, level='INTERNAL_ACTION')
            return

        if term_id not in ontology:
            detail = (
                'Antibody characterization {} contains '
                'a biosample_term_id {} that is not in the ontology'
            ).format(value['@id'], term_id)
            # Report the full detail message; previously the bare term_id
            # was passed and the message built above was discarded.
            yield AuditFailure('term_id not in ontology', detail, level='INTERNAL_ACTION')
            return

        ontology_term_name = ontology[term_id]['name']
        if ontology_term_name != term_name and term_name not in ontology[term_id]['synonyms']:
            detail = (
                'Antibody characterization {} '
                'has a mismatch between biosample term_id ({}) '
                'and term_name ({}), ontology term_name for term_id {} '
                'is {}.'
            ).format(value['@id'], term_id, term_name, term_id, ontology_term_name)
            yield AuditFailure('inconsistent ontology term', detail, level='ERROR')
            return
def audit_biosample_part_of_consistency(value, system):
    '''
    Check that a biosample's term is part of the term of the biosample it
    was separated from (part_of).

    Nothing is reported when there is no part_of, when both biosamples
    share a term id, when the parent term id is UBERON:0000468, or when
    both ids are in the ontology and is_part_of() returns True. Term names
    fall back to the term ids in the message when absent.
    '''
    if 'part_of' not in value:
        return

    parent = value['part_of']
    term_id = value['biosample_term_id']
    parent_term_id = parent['biosample_term_id']
    term_name = value.get('biosample_term_name', term_id)
    parent_term_name = parent.get('biosample_term_name', parent_term_id)

    if term_id == parent_term_id or parent_term_id == 'UBERON:0000468':
        return

    ontology = system['registry']['ontology']
    if (term_id in ontology and parent_term_id in ontology and
            is_part_of(term_id, parent_term_id, ontology) is True):
        return

    detail = (
        'Biosample {} '
        'with biosample term {} '
        'was separated from biosample {} '
        'with biosample term {}. '
        'The {} '
        'ontology does not note that part_of relationship.'
    ).format(value['@id'], term_name, parent['@id'], parent_term_name, term_id)
    yield AuditFailure('inconsistent biosample_term_id', detail,
                       level='INTERNAL_ACTION')
def audit_antibody_characterization_unique_reviews(value, system):
    '''
    Make sure primary characterizations have unique lane, biosample_term_id and
    organism combinations for characterization reviews

    Deleted, unsubmitted, in-progress and unreviewed characterizations and
    secondary characterizations are skipped. Raises AuditFailure
    (INTERNAL_ACTION) at the first review that repeats an earlier
    (lane, biosample_term_id, organism) combination. The ``system``
    argument is not used.
    '''
    if value['status'] in ['deleted', 'not submitted for review by lab',
                           'in progress', 'not reviewed']:
        return
    if 'secondary_characterization_method' in value:
        return

    seen_reviews = set()
    for review in value['characterization_reviews']:
        lane = review['lane']
        term_id = review['biosample_term_id']
        organism = review['organism']
        # A tuple keeps each field in its own position; an unordered set
        # would treat reviews whose values are swapped between fields as
        # the same combination.
        review_key = (lane, term_id, organism)
        if review_key in seen_reviews:
            detail = 'Lane {} in {} is a duplicate review for {} - {}'.format(
                lane, value['@id'], term_id, organism)
            raise AuditFailure('duplicate lane review', detail, level='INTERNAL_ACTION')
        seen_reviews.add(review_key)
def audit_file_replicate_match(value, system):
    '''
    A file's replicate should belong to the same experiment that the file does.
    These tend to get confused when replacing objects.

    Deleted, replaced and revoked files and files without a replicate are
    skipped. The comparison is made on the uuid of the replicate's
    experiment and of the file's dataset. The ``system`` argument is not
    used.
    '''
    if value['status'] in ['deleted', 'replaced', 'revoked']:
        return
    if 'replicate' not in value:
        return

    replicate = value['replicate']
    if replicate['experiment']['uuid'] == value['dataset']['uuid']:
        return

    detail = (
        'File {} from experiment {} '
        'is associated with replicate [{},{}] '
        '{}, but that replicate is associated with a different '
        'experiment {}.'
    ).format(
        value['@id'],
        value['dataset']['@id'],
        replicate['biological_replicate_number'],
        replicate['technical_replicate_number'],
        replicate['@id'],
        replicate['experiment']['@id'])
    yield AuditFailure('inconsistent replicate', detail, level='ERROR')
def audit_status_replicate(value, system):
    '''
    As the experiment-replicate relationship is reverse calculated, the status checker for item
    is not sufficient to catch all cases of status mismatch between replicates and experiments.
    * an in progress replicate can't have an experiment in
      [released, revoked, proposed, preliminary]
    * a released or revoked replicate must have an experiment in [released, revoked]
    * if the experiment is deleted, the replicate must be deleted

    Raises AuditFailure (INTERNAL_ACTION) when any of the three rules is
    broken. The ``system`` argument is not used.
    '''
    replicate_status = value['status']
    experiment_status = value['experiment']['status']

    replicate_lags_behind = (
        replicate_status in ['in progress'] and
        experiment_status in ['released', 'revoked', 'proposed', 'preliminary']
    )
    replicate_runs_ahead = (
        replicate_status in ['released', 'revoked'] and
        experiment_status not in ['released', 'revoked']
    )
    survives_deleted_experiment = (
        experiment_status in ['deleted'] and
        replicate_status not in ['deleted']
    )

    if not (replicate_lags_behind or replicate_runs_ahead or survives_deleted_experiment):
        return

    # If any of the three cases exist, there is an error
    detail = '{} replicate {} is in {} experiment'.format(
        replicate_status, value['@id'], experiment_status)
    raise AuditFailure('mismatched status', detail, level='INTERNAL_ACTION')
def audit_biosample_part_of_consistency(value, system):
    '''
    Check that a biosample's ontology term is part of the term of the
    biosample it was separated from (part_of).

    Nothing is reported when there is no part_of, when both biosamples
    share a term id, when the parent term id is UBERON:0000468, or when
    both ids are in the ontology and is_part_of() returns True.
    '''
    if 'part_of' not in value:
        return

    parent = value['part_of']
    term_id = value['biosample_ontology']['term_id']
    term_name = value['biosample_ontology']['term_name']
    parent_term_id = parent['biosample_ontology']['term_id']
    parent_term_name = parent['biosample_ontology']['term_name']

    if term_id == parent_term_id or parent_term_id == 'UBERON:0000468':
        return

    ontology = system['registry']['ontology']
    if (term_id in ontology and parent_term_id in ontology and
            is_part_of(term_id, parent_term_id, ontology) is True):
        return

    detail = (
        'Biosample {} with biosample term {} '
        'was separated from biosample {} '
        'with biosample term {}. The {} '
        'ontology does not note that part_of relationship.'
    ).format(
        audit_link(path_to_text(value['@id']), value['@id']),
        term_name,
        audit_link(path_to_text(parent['@id']), parent['@id']),
        parent_term_name,
        term_id)
    yield AuditFailure('inconsistent BiosampleType term', detail,
                       level='INTERNAL_ACTION')
def audit_duplicate_quality_metrics(value, system):
    '''
    Report quality metrics that appear more than once on a file.

    Two metrics count as duplicates when they share the same signature:
    the first entry of the metric's '@type' list (None when '@type' is
    absent) together with its 'processing_stage' (None when absent).

    value  -- file object; reads 'quality_metrics' and '@id'
    system -- not used by this check

    Yields one AuditFailure('duplicate quality metric', level
    INTERNAL_ACTION) per duplicated signature, no matter how many times
    that signature repeats.
    '''
    quality_metrics = value.get('quality_metrics')
    if not quality_metrics:
        return

    seen_signatures = []
    reported_signatures = []
    for metric in quality_metrics:
        metric_type = metric.get('@type', [None])[0]
        signature = (metric_type, metric.get('processing_stage'))

        if signature not in seen_signatures:
            seen_signatures.append(signature)
            continue
        if signature in reported_signatures:
            # This signature has already produced its one audit.
            continue

        reported_signatures.append(signature)
        file_link = audit_link(path_to_text(value.get('@id')), value.get('@id'))
        detail = 'File {} has more than one {} quality metric.'.format(
            file_link, metric_type)
        yield AuditFailure(
            'duplicate quality metric', detail, level='INTERNAL_ACTION')
def audit_treatment_concentration_series_mixed_units(value, system):
    '''
    Report a series whose treatments do not all use the same amount units.

    Collects 'amount_units' from every treatment on the biosamples reached
    through related_datasets -> replicates -> library -> biosample.
    Datasets that are deleted, replaced or revoked and replicates that are
    deleted are skipped, as is the whole series when it has one of those
    three statuses or no 'related_datasets'.

    value  -- series object; reads 'status', 'related_datasets' and '@id'
    system -- not used by this check

    Yields AuditFailure('inconsistent treatment units', level
    INTERNAL_ACTION) when more than one distinct unit is found.
    '''
    inactive_statuses = ['deleted', 'replaced', 'revoked']
    if value['status'] in inactive_statuses:
        return
    if 'related_datasets' not in value:
        return

    units_seen = set()
    for dataset in value['related_datasets']:
        if dataset['status'] in inactive_statuses:
            continue
        if 'replicates' not in dataset:
            continue
        for replicate in dataset['replicates']:
            if replicate['status'] == 'deleted':
                continue
            if 'library' not in replicate:
                continue
            if 'biosample' not in replicate['library']:
                continue
            biosample = replicate['library']['biosample']
            if 'treatments' not in biosample:
                continue
            units_seen.update(
                treatment['amount_units']
                for treatment in biosample['treatments']
            )

    if len(units_seen) <= 1:
        return

    detail = (
        f"Treatments associated with series {audit_link(path_to_text(value['@id']), value['@id'])} "
        f"use inconsistent amount units {units_seen}."
    )
    yield AuditFailure('inconsistent treatment units', detail,
                       level='INTERNAL_ACTION')
def audit_file_processed_derived_from(value, system=None):
    '''
    Check that a processed file lists the files it was derived from.

    Files whose output_category is 'raw data' or 'reference' are skipped.
    Any other file must have a non-empty 'derived_from' list. For bam
    files, each derived_from entry that is a bam, a fastq, or a raw-data
    'reads' fasta is examined further:

    * the entry counts as a valid source when its status is not deleted,
      replaced or revoked, or when its status equals the bam's own status
    * the entry must belong to the same dataset as the bam

    value  -- file object; reads 'output_category', 'derived_from',
              'file_format', 'status', 'dataset' and '@id'
    system -- not used by this check; optional so the checker can be
              called with or without it

    Yields AuditFailure('missing derived_from') when the list is absent or
    empty, or when a bam has no valid source entry, and
    AuditFailure('inconsistent derived_from') for each source entry from a
    different dataset. All failures are level INTERNAL_ACTION.
    '''
    if value['output_category'] in ['raw data', 'reference']:
        return

    preamble = (
        'derived_from is a list of files that were used to create a given file; '
        'for example, fastq file(s) will appear in the derived_from list of an alignments file. '
    )

    derived_from_files = value.get('derived_from')
    if not derived_from_files:
        detail = preamble + \
            'Processed file {} '.format(value['@id']) + \
            'is missing the requisite file specification in its derived_from list.'
        yield AuditFailure('missing derived_from', detail,
                           level='INTERNAL_ACTION')
        return

    if value['file_format'] != 'bam':
        return

    # value['dataset'] is read as a mapping while each derived_from entry
    # carries its dataset as a plain value, so both the comparison and the
    # message use the '@id'.
    dataset_id = value['dataset'].get('@id')
    fastq_bam_counter = 0

    for f in derived_from_files:
        is_read_source = (
            f['file_format'] == 'bam' or
            f['file_format'] == 'fastq' or
            (f['file_format'] == 'fasta' and
             f['output_type'] == 'reads' and
             f['output_category'] == 'raw data')
        )
        if not is_read_source:
            continue

        if f['status'] not in ['deleted', 'replaced', 'revoked'] or \
                f['status'] == value['status']:
            fastq_bam_counter += 1

        if f['dataset'] != dataset_id:
            detail = preamble + \
                'Alignments file {} '.format(value['@id']) + \
                'from experiment {} '.format(dataset_id) + \
                'specifies a file {} '.format(f['@id']) + \
                'from a different experiment {} '.format(f['dataset']) + \
                'in its derived_from list.'
            yield AuditFailure('inconsistent derived_from', detail,
                               level='INTERNAL_ACTION')

    if fastq_bam_counter == 0:
        detail = preamble + \
            'Alignments file {} '.format(value['@id']) + \
            'is missing the requisite file specification in its derived_from list.'
        yield AuditFailure('missing derived_from', detail,
                           level='INTERNAL_ACTION')
def audit_file_size(value, system):
    '''
    Require a file_size on every file that is expected to have content.

    Files whose status is deleted, replaced, uploading or revoked are
    exempt.

    value  -- file object; reads 'status', 'file_size' (presence only)
              and, on failure only, '@id'
    system -- not used by this check

    Raises AuditFailure('missing file_size', level INTERNAL_ACTION) when a
    non-exempt file has no 'file_size' key.
    '''
    exempt_statuses = ('deleted', 'replaced', 'uploading', 'revoked')
    if value['status'] in exempt_statuses or 'file_size' in value:
        return

    detail = 'File {} requires a value for file_size'.format(value['@id'])
    raise AuditFailure('missing file_size', detail, level='INTERNAL_ACTION')
def audit_biosample_transfection_type(value, system):
    '''
    A biosample with constructs or rnais should have a transfection_type.

    Deleted biosamples are skipped. 'rnais' is checked before
    'constructs', so a biosample with both reports the RNAi message.

    value  -- biosample object; reads 'status', 'rnais', 'constructs',
              'transfection_type' (presence only) and, on failure, '@id'
    system -- not used by this check

    Raises AuditFailure('missing transfection_type', level ERROR) for the
    first non-empty field found without a transfection_type.
    '''
    if value['status'] == 'deleted':
        return

    checks = (
        ('rnais',
         'Biosample {} with a value for RNAi requires transfection_type'),
        ('constructs',
         'Biosample {} with a value for construct requires transfection_type'),
    )
    for field, template in checks:
        if value[field] and 'transfection_type' not in value:
            detail = template.format(value['@id'])
            raise AuditFailure('missing transfection_type', detail,
                               level='ERROR')
def audit_antibody_characterization_status(value, system):
    '''
    Make sure the lane_status matches the characterization status.

    Characterizations whose status is 'not reviewed', 'not submitted for
    review by lab', 'deleted' or 'in progress' are skipped, as are those
    with a 'secondary_characterization_method'. For the rest, every lane
    in 'characterization_reviews' is checked:

    * a lane is 'pending dcc review' if and only if the characterization
      itself is 'pending dcc review'
    * if any lane is 'compliant', the characterization must be 'compliant'

    value  -- antibody characterization object; reads 'status',
              'secondary_characterization_method' (presence only),
              'characterization_reviews' and '@id'
    system -- not used by this check

    Raises AuditFailure('mismatched lane status', level INTERNAL_ACTION)
    on the first lane that breaks the pending rule, or after all lanes
    have been checked when the compliant rule is broken.
    '''
    if value['status'] in [
            'not reviewed',
            'not submitted for review by lab',
            'deleted',
            'in progress']:
        return

    if 'secondary_characterization_method' in value:
        return

    # Check each of the lane_statuses in characterization_reviews for an
    # appropriate match.
    is_pending = value['status'] == 'pending dcc review'
    has_compliant_lane = False
    for lane in value['characterization_reviews']:
        lane_is_pending = lane['lane_status'] == 'pending dcc review'
        if lane_is_pending != is_pending:
            detail = (
                'A lane.status of {} in antibody characterization {} is incompatible with antibody_characterization.status of {}.'
                .format(lane['lane_status'],
                        audit_link(path_to_text(value['@id']), value['@id']),
                        value['status']))
            raise AuditFailure('mismatched lane status', detail,
                               level='INTERNAL_ACTION')

        if lane['lane_status'] == 'compliant':
            has_compliant_lane = True

    if has_compliant_lane and value['status'] != 'compliant':
        # Name the compliant status explicitly: once the loop has finished,
        # `lane` is merely the last lane reviewed, which need not be the
        # compliant one that makes the characterization inconsistent.
        detail = (
            'A lane.status of {} in antibody characterization {} is incompatible with antibody_characterization status of {}.'
            .format('compliant',
                    audit_link(path_to_text(value['@id']), value['@id']),
                    value['status']))
        raise AuditFailure('mismatched lane status', detail,
                           level='INTERNAL_ACTION')
def audit_file_format_specifications(value, system):
    '''
    Check that every attached format specification is the right document type.

    value  -- file object; reads 'file_format_specifications' (treated as
              empty when absent) and, on failure only, '@id'
    system -- not used by this check

    Raises AuditFailure('inconsistent document_type', level ERROR) for the
    first listed document whose 'document_type' is not
    "file format specification".
    '''
    specifications = value.get('file_format_specifications', [])
    offending = next(
        (doc for doc in specifications
         if doc['document_type'] != "file format specification"),
        None,
    )
    if offending is None:
        return

    detail = 'File {} has document {} not of type file format specification'.format(
        value['@id'], offending['@id'])
    raise AuditFailure('inconsistent document_type', detail, level='ERROR')
def audit_file_md5sum_integrity(value, system):
    '''
    Check that a file's md5sum is a 32-character hexadecimal string.

    Files whose status is deleted, replaced or revoked are skipped.

    The value is validated character by character rather than with
    int(md5sum, 16): int() also accepts a leading sign, surrounding
    whitespace, a '0x' prefix, underscores between digits and non-ASCII
    digits, so malformed checksums could pass as valid hexadecimal.

    value  -- file object; reads 'status', 'md5sum' and, on failure, '@id'
    system -- not used by this check

    Yields AuditFailure('inconsistent md5sum', level INTERNAL_ACTION) when
    the md5sum is not a string of hexadecimal digits, or when it is
    hexadecimal but not exactly 32 characters long.
    '''
    if value['status'] in ['deleted', 'replaced', 'revoked']:
        return

    md5sum = value['md5sum']
    hex_digits = '0123456789abcdefABCDEF'
    is_hexadecimal = (
        isinstance(md5sum, str)
        and md5sum != ''
        and all(character in hex_digits for character in md5sum)
    )

    if not is_hexadecimal:
        detail = 'File {} '.format(value['@id']) + \
            'has an md5sum value of {}, '.format(md5sum) + \
            'which is not a valid hexadecimal number.'
        yield AuditFailure('inconsistent md5sum', detail,
                           level='INTERNAL_ACTION')
    elif len(md5sum) != 32:
        detail = 'File {} '.format(value['@id']) + \
            'has an md5sum value of {}, '.format(md5sum) + \
            'which is not 32 characters long.'
        yield AuditFailure('inconsistent md5sum', detail,
                           level='INTERNAL_ACTION')