def __link_entity_accessions(submission: Submission, entity: Entity):
    """Populate ``submission`` with accessioned fixtures and link samples to ``entity``.

    Maps one 'experiment_run' entity carrying an 'ENA' accession (it is not
    linked to ``entity``), then maps three 'sample' entities, each carrying a
    'BioSamples' accession, and links every sample to ``entity``.
    """
    experiment_run = submission.map('experiment_run', 'ABC1234', {})
    experiment_run.add_accession('ENA', 'ABC1234')

    for sample_number in (123, 456, 789):
        sample = submission.map('sample', str(sample_number), {})
        sample.add_accession('BioSamples', f'SAME{sample_number}')
        entity.add_link('sample', sample.identifier.index)
def test_missmatched_checksum_should_log_error(self, mock: MagicMock):
    """A stated checksum that differs from the uploaded one is reported on the checksum attribute."""
    # Given
    # NOTE(review): `mock` presumably stands in for the drag-and-drop file
    # listing ("name,checksum" per line) -- confirm against the patch decorator.
    uploaded_name = 'file_name.extension1.ex2'
    uploaded_checksum = 'checksum'
    stated_checksum = 'not-checksum'
    mock.return_value = f"{uploaded_name},{uploaded_checksum}"
    entity_type = 'run_experiment'

    # When
    validator = UploadValidator('uuid')
    entity = Entity(
        entity_type,
        f'{entity_type}1',
        {
            'uploaded_file_1': uploaded_name,
            'uploaded_file_1_checksum': stated_checksum,
        },
    )
    validator.validate_entity(entity)

    # Then
    message = f'The checksum found on drag-and-drop {uploaded_checksum} does not match: {stated_checksum}'
    self.assertDictEqual({'uploaded_file_1_checksum': [message]}, entity.get_errors())
def add_errors(self, schema, ena_type: str, entity_type: str, entity: Entity):
    """Record every entry of ``schema.error_log`` as an error on ``entity``.

    The message is the 'error' group of ``self.regex`` when the pattern matches,
    otherwise the raw log message. The target attribute is the lower-cased last
    segment of the error path; when the entity has no such attribute the error
    is filed under the ``<entity_type>_ena_<ena_type>_accession`` attribute and
    the full path is prefixed to the message.
    """
    fallback_attribute = f'{entity_type}_ena_{ena_type}_accession'.lower()

    for log_entry in schema.error_log:
        parsed = self.regex.match(log_entry.message)
        if parsed:
            message = parsed.group('error')
        else:
            message = log_entry.message

        attribute = log_entry.path.rpartition('/')[2].lower()
        if attribute in entity.attributes:
            entity.add_error(attribute, message)
        else:
            # Unknown attribute: keep the path in the message so the origin is not lost.
            entity.add_error(fallback_attribute, f'{log_entry.path} {message}')
def validate_entity(self, entity: Entity):
    """Example implementation showing how a validator reports errors.

    Adds one single error and then a list of errors to the same attribute of
    ``entity``, and finally raises ``NotImplementedError`` because this example
    is not meant to be used as a real validator.
    """
    example_attribute = 'fake_attribute'

    # A single error is attached to the attribute that caused it.
    entity.add_error(example_attribute, 'Example error message')

    # Several errors for the same attribute can be attached in one call.
    entity.add_errors(example_attribute, ['error 1', 'error 2'])

    raise NotImplementedError('Example validate entity used')
def __add_errors_to_entity(entity: Entity, schema_errors: dict):
    """Copy schema validation errors onto ``entity``, grouped by attribute.

    ``schema_errors`` is iterated as a sequence of records, each providing a
    'dataPath' (used as the attribute name once surrounding dots are stripped)
    and an 'errors' list of message strings.

    For every message:
      * double quotes are replaced by single quotes;
      * 'should NOT be valid' is swapped for the more descriptive message built
        by ``JsonValidator.__improve_not_be_valid_error_message``;
      * 'should match some schema in anyOf' is dropped.

    The remaining messages are added to the entity in one ``add_errors`` call
    per record (the call is made even when no message remains).
    """
    for schema_error in schema_errors:
        attribute_name = str(schema_error['dataPath']).strip('.')
        stripped_errors = []
        for error in schema_error['errors']:
            # str.replace returns a new string; the result has to be rebound,
            # otherwise the quote normalisation is silently discarded.
            error = error.replace('"', "'")
            if error == 'should NOT be valid':
                error = JsonValidator.__improve_not_be_valid_error_message(
                    entity.identifier.entity_type, attribute_name)
            if error != 'should match some schema in anyOf':
                stripped_errors.append(error)
        entity.add_errors(attribute_name, stripped_errors)
def test_valid_sample_tax_id_should_not_return_error(self):
    """A sample whose tax_id is accepted by the taxonomy service gets no errors."""
    # Given
    validator = self.taxonomy_validator
    validator.ena_taxonomy.validate_tax_id = MagicMock(return_value=self.valid_sarscov2)
    sample = Entity('sample', 'sample1', {'tax_id': '2697049'})

    # When
    validator.validate_entity(sample)

    # Then
    self.assertFalse(sample.has_errors())
    self.assertDictEqual({}, sample.get_errors())
def test_valid_sample_name_should_not_return_error(self):
    """A sample whose scientific_name is accepted by the taxonomy service gets no errors."""
    # Given
    validator = self.taxonomy_validator
    validator.ena_taxonomy.validate_scientific_name = MagicMock(return_value=self.valid_sarscov2)
    sample = Entity(
        'sample',
        'sample1',
        {'scientific_name': 'Severe acute respiratory syndrome coronavirus 2'},
    )

    # When
    validator.validate_entity(sample)

    # Then
    self.assertFalse(sample.has_errors())
    self.assertDictEqual({}, sample.get_errors())
def add_entity_accessions(entity: Entity, ignore: List[str]):
    """Register accessions found in ``<entity_type>_<service>_accession`` attributes.

    Attributes listed in ``ignore`` are skipped. The ``<service>`` part of the
    attribute name is translated through ``SERVICE_NAMES`` when it has an entry
    there. An accession is only added when both the resulting service name and
    the attribute value are truthy.
    """
    prefix = f'{entity.identifier.entity_type}_'
    suffix = '_accession'

    for name, value in entity.attributes.items():
        if name in ignore:
            continue
        if not (name.startswith(prefix) and name.endswith(suffix)):
            continue

        # Whatever sits between the prefix and the suffix names the service.
        raw_service = name[len(prefix):-len(suffix)]
        service = SERVICE_NAMES.get(raw_service, raw_service)
        if service and value:
            entity.add_accession(service, value)
def test_invalid_tax_id_should_return_error(self):
    """An error returned by the tax_id lookup is reported on the 'tax_id' attribute."""
    # Given
    message = 'Not valid tax_id: 999999999999.'
    validator = self.taxonomy_validator
    validator.ena_taxonomy.validate_tax_id = MagicMock(return_value={'error': message})
    sample = Entity('sample', 'sample1', {'tax_id': '999999999999'})

    # When
    validator.validate_entity(sample)

    # Then
    self.assertTrue(sample.has_errors())
    self.assertDictEqual({'tax_id': [message]}, sample.get_errors())
def get_linked_accessions(self, entity: Entity) -> Dict[str, Set[str]]:
    """Collect the accessions of every entity linked to ``entity``, grouped by service.

    Walks all entity types known to this object, looks up the entities of each
    type linked to ``entity`` and gathers their (service, accession) pairs.

    :param entity: the entity whose links are followed.
    :return: mapping of service name to the set of accessions found for it;
        empty when no linked entity has an accession.
    """
    accessions: Dict[str, Set[str]] = {}
    for entity_type in self.get_entity_types():
        # The loop variable must not be called `entity`: rebinding the parameter
        # would make later entity types resolve links of the last linked
        # entity instead of the entity that was passed in.
        for linked_entity in self.get_linked_entities(entity, entity_type):
            for service, accession in linked_entity.get_accessions():
                accessions.setdefault(service, set()).add(accession)
    return accessions
def test_invalid_name_should_return_error(self):
    """An error returned by the scientific_name lookup is reported on that attribute."""
    # Given
    message = 'Not valid scientific_name: Lorem Ipsum.'
    validator = self.taxonomy_validator
    validator.ena_taxonomy.validate_scientific_name = MagicMock(return_value={'error': message})
    sample = Entity('sample', 'sample1', {'scientific_name': 'Lorem Ipsum'})

    # When
    validator.validate_entity(sample)

    # Then
    self.assertTrue(sample.has_errors())
    self.assertDictEqual({'scientific_name': [message]}, sample.get_errors())
def map(self, entity_type: str, index: str, attributes: dict) -> Entity:
    """Return the entity stored under (``entity_type``, ``index``), creating it if needed.

    When an entity is already stored under that type and index the call is
    delegated to the collision handler and its result is returned. Otherwise a
    new ``Entity`` is built from the arguments, stored and returned.
    """
    already_mapped = entity_type in self.__map and index in self.__map[entity_type]
    if already_mapped:
        return self.__handle_collision(entity_type, index, attributes)

    created = Entity(entity_type, index, attributes)
    self.__map.setdefault(entity_type, {})[index] = created
    return created
def test_missing_file_should_log_error(self, mock: MagicMock):
    """A file name that is absent from the upload listing is reported on the file attribute."""
    # Given
    mock.return_value = f"file_name.extension1.ex2,checksum"
    entity_type = 'run_experiment'
    validator = UploadValidator('uuid')
    entity = Entity(entity_type, f'{entity_type}1', {'uploaded_file_1': 'missing.file'})

    # When
    validator.validate_entity(entity)

    # Then
    self.assertDictEqual(
        {'uploaded_file_1': ['File has not been uploaded to drag-and-drop: missing.file']},
        entity.get_errors(),
    )
def validate_entity(self, entity: Entity):
    """Validate the taxonomy attributes of ``entity`` and attach any errors to it.

    * Both 'tax_id' and 'scientific_name' present: the pair is checked with
      ``ena_taxonomy.validate_taxonomy`` and errors come from ``get_taxonomy_errors``.
    * Only one of them present: it is checked on its own and errors come from
      ``get_errors`` for that attribute.
    * Neither present: nothing is validated and no error is added.
    """
    attributes = entity.attributes
    has_tax_id = 'tax_id' in attributes
    has_name = 'scientific_name' in attributes

    if has_tax_id and has_name:
        response = self.ena_taxonomy.validate_taxonomy(
            tax_id=attributes['tax_id'],
            scientific_name=attributes['scientific_name'],
        )
        found = self.get_taxonomy_errors(response)
    elif has_tax_id:
        response = self.ena_taxonomy.validate_tax_id(attributes['tax_id'])
        found = self.get_errors(response, 'tax_id')
    elif has_name:
        response = self.ena_taxonomy.validate_scientific_name(attributes['scientific_name'])
        found = self.get_errors(response, 'scientific_name')
    else:
        found = {}

    for attribute, messages in found.items():
        entity.add_errors(attribute, messages)
def validate_file(self, entity: Entity, file_attribute: str, check_attribute: str):
    """Check one file attribute of ``entity`` against ``self.file_checksum_map``.

    * File name not in the map: an error is added on ``file_attribute``.
    * No checksum stated on the entity: the checksum from the map is written
      into ``entity.attributes[check_attribute]``.
    * Stated checksum differs from the one in the map: an error is added on
      ``check_attribute``.
    """
    file_name = entity.attributes[file_attribute]
    if file_name not in self.file_checksum_map:
        entity.add_error(
            file_attribute,
            f'File has not been uploaded to drag-and-drop: {file_name}')
        return

    upload_checksum = self.file_checksum_map[file_name]
    if check_attribute not in entity.attributes:
        # Nothing stated by the submitter: fill in the checksum we know.
        entity.attributes[check_attribute] = upload_checksum
        return

    stated_checksum = entity.attributes[check_attribute]
    if stated_checksum != upload_checksum:
        entity.add_error(
            check_attribute,
            f'The checksum found on drag-and-drop {upload_checksum} does not match: {stated_checksum}')
def test_validation_with_second_file_present(self, mock: MagicMock):
    """Two uploaded files with matching stated checksums produce no errors."""
    # Given
    mock.return_value = (
        f"first-file,first-checksum\n"
        f"second-file,second-checksum"
    )
    entity_type = 'run_experiment'

    # When
    validator = UploadValidator('uuid')
    entity = Entity(
        entity_type,
        f'{entity_type}1',
        {
            'uploaded_file_1': 'first-file',
            'uploaded_file_1_checksum': 'first-checksum',
            'uploaded_file_2': 'second-file',
            'uploaded_file_2_checksum': 'second-checksum',
        },
    )
    validator.validate_entity(entity)

    # Then
    self.assertDictEqual({}, entity.get_errors())
def test_inconsistent_sample_should_return_error(self):
    """A tax_id and scientific_name that resolve to different taxa flag both attributes."""
    # Given
    taxonomy = self.taxonomy_validator.ena_taxonomy
    taxonomy.validate_scientific_name = MagicMock(return_value=self.valid_sarscov2)
    taxonomy.validate_tax_id = MagicMock(return_value=self.valid_human)
    sample = Entity(
        'sample',
        'sample1',
        {
            'scientific_name': 'Severe acute respiratory syndrome coronavirus 2',
            'tax_id': '9606',
        },
    )
    mismatch = 'Information is not consistent between taxId: 9606 and scientificName: Severe acute respiratory syndrome coronavirus 2'

    # When
    self.taxonomy_validator.validate_entity(sample)

    # Then
    self.assertTrue(sample.has_errors())
    self.assertDictEqual(
        {'scientific_name': [mismatch], 'tax_id': [mismatch]},
        sample.get_errors(),
    )
def test_validation_with_second_file_missing(self, mock: MagicMock):
    """Only the file that is absent from the upload listing is reported."""
    # Given
    mock.return_value = f"first-file,first-checksum"
    entity_type = 'run_experiment'

    # When
    validator = UploadValidator('uuid')
    entity = Entity(
        entity_type,
        f'{entity_type}1',
        {'uploaded_file_1': 'first-file', 'uploaded_file_2': 'second-file'},
    )
    validator.validate_entity(entity)

    # Then
    self.assertDictEqual(
        {'uploaded_file_2': ['File has not been uploaded to drag-and-drop: second-file']},
        entity.get_errors(),
    )
def test_invalid_sample_name_should_return_error(self):
    """An invalid scientific_name next to a valid tax_id yields the lookup error plus the consistency error."""
    # Given
    invalid_name = 'Not valid scientific_name: Lorem Ipsum.'
    mismatch = 'Information is not consistent between taxId: 2697049 and scientificName: Lorem Ipsum'
    taxonomy = self.taxonomy_validator.ena_taxonomy
    taxonomy.validate_scientific_name = MagicMock(return_value={'error': invalid_name})
    taxonomy.validate_tax_id = MagicMock(return_value=self.valid_sarscov2)
    sample = Entity(
        'sample',
        'sample1',
        {'scientific_name': 'Lorem Ipsum', 'tax_id': '2697049'},
    )

    # When
    self.taxonomy_validator.validate_entity(sample)

    # Then
    self.assertTrue(sample.has_errors())
    self.assertDictEqual(
        {'scientific_name': [invalid_name, mismatch], 'tax_id': [mismatch]},
        sample.get_errors(),
    )
def convert_sample(self, sample_entity: Entity) -> Sample:
    """Build a ``Sample`` from ``sample_entity``.

    The accession is the entity's 'BioSamples' accession; name, domain, taxon
    id and species come from ``named_attribute`` (domain falls back to
    ``self.domain``). Every entity attribute whose name is not in
    ``REMOVE_KEYS`` is appended as an ``Attribute`` with underscores in the
    name replaced by spaces and the unit taken from ``self.unit_map`` (None
    when there is no entry).
    """
    accession = sample_entity.get_accession('BioSamples')
    title = self.named_attribute(sample_entity, 'sample_title')
    domain = self.named_attribute(sample_entity, 'domain', self.domain)
    taxon_id = self.named_attribute(sample_entity, 'tax_id')
    species = self.named_attribute(sample_entity, 'scientific_name')

    sample = Sample(
        accession=accession,
        name=title,
        domain=domain,
        ncbi_taxon_id=taxon_id,
        species=species,
    )
    sample._append_organism_attribute()

    for key, value in sample_entity.attributes.items():
        if key in REMOVE_KEYS:
            continue
        display_name = key.replace('_', ' ')
        unit = self.unit_map.get(key)
        sample.attributes.append(Attribute(name=display_name, value=value, unit=unit))
    return sample
def convert_experiment(converter: EnaExperimentConverter, data: Submission, experiment: Entity) -> Element:
    """Convert ``experiment`` using one linked sample and one linked study.

    When no sample or no study is linked, the corresponding error(s) are added
    to the experiment and ``None`` is returned without converting. When more
    than one sample or study is linked, an error naming the one actually used
    is added and the conversion still goes ahead with it.
    """
    error_attribute = 'run_experiment_ena_experiment_accession'
    samples = data.get_linked_entities(experiment, 'sample')
    studies = data.get_linked_entities(experiment, 'study')
    sample_count = len(samples)
    study_count = len(studies)

    if sample_count < 1 or study_count < 1:
        if sample_count < 1:
            experiment.add_error(error_attribute, 'No Linked Sample')
        if study_count < 1:
            experiment.add_error(error_attribute, 'No Linked Study')
        return None

    sample = samples.pop()
    study = studies.pop()

    # ENA only supports linking one study & sample to an experiment
    if sample_count > 1:
        experiment.add_error(
            error_attribute,
            f'More than one Sample Linked, using first: {sample.identifier.index}')
    if study_count > 1:
        experiment.add_error(
            error_attribute,
            f'More than one Study Linked, using first: {study.identifier.index}')
    return converter.convert_experiment(experiment, sample, study)
def test_passed_bio_study_entity_returns_correct_json_representative(self):
    """Converting a study entity yields the payload built by the expected-payload helper."""
    # Given
    study_alias = "SARS-CoV-2 genomes 123ABC alias"
    attributes = {
        "study_accession": "PRJEB12345",
        "study_alias": study_alias,
        "email_address": "*****@*****.**",
        "center_name": "EBI",
        "study_name": "SARS-CoV-2 genomes 123ABC name",
        "short_description": "test short description",
        "abstract": "test abstract",
        "release_date": "2020-08-21",
    }
    study = Entity(entity_type="study", index=study_alias, attributes=attributes)
    expected_payload = self.__get_expected_payload(study)

    # When
    actual_payload = BioStudyConverter.convert_study(study)

    # Then
    self.assertDictEqual(expected_payload, actual_payload)
def test_validation_should_edit_file_attributes(self, mock: MagicMock):
    """When no checksum is stated, validation writes the uploaded checksum onto the entity."""
    # Given
    uploaded_name = 'file_name.extension1.ex2'
    uploaded_checksum = 'checksum'
    mock.return_value = f"{uploaded_name},{uploaded_checksum}"
    entity_type = 'run_experiment'

    # When
    validator = UploadValidator('uuid')
    entity = Entity(entity_type, f'{entity_type}1', {'uploaded_file_1': uploaded_name})
    validator.validate_entity(entity)

    # Then
    self.assertDictEqual(
        {
            'uploaded_file_1': uploaded_name,
            'uploaded_file_1_checksum': uploaded_checksum,
        },
        entity.attributes,
    )
def __add_accession(study: Entity, submission_payload: dict):
    """Set 'accno' on ``submission_payload`` when ``study`` has a 'BioStudies' accession.

    The payload is modified in place and also returned; it is returned
    untouched when the study has no (truthy) accession.
    """
    accession: str = study.get_accession('BioStudies')
    if not accession:
        return submission_payload

    submission_payload['accno'] = accession
    return submission_payload
def get_linked_entities(self, entity: Entity, entity_type: str) -> Set[Entity]:
    """Return the set of ``entity_type`` entities that ``entity`` links to.

    Each linked index reported by the entity is resolved with ``get_entity``.
    """
    return {
        self.get_entity(entity_type, linked_index)
        for linked_index in entity.get_linked_indexes(entity_type)
    }
def link_entities(entity_a: Entity, entity_b: Entity):
    """Link two entities to each other by registering each one's identifier on the other."""
    for holder, target in ((entity_a, entity_b), (entity_b, entity_a)):
        holder.add_link_id(target.identifier)
def add_link(link: dict, entity: Entity, accession_services: Iterable[str]):
    """Point ``link`` at ``entity`` by accession when available, otherwise by index.

    Uses the first accession the entity has among ``accession_services`` and
    stores it under '@accession'; when there is none, the entity's index is
    stored under '@refname' instead.
    """
    accession = entity.get_first_accession(accession_services)
    if not accession:
        link['@refname'] = ['', fixed_attribute, entity.identifier.index]
        return

    link['@accession'] = ['', fixed_attribute, accession]
def add_alias(self, spec: dict, entity: Entity):
    """Identify ``entity`` in ``spec`` by its ENA accession, or by alias when it has none.

    The accession looked up is the one for the ``ENA_<ena_type>`` service. It
    is stored under '@accession'; without one the entity's index is stored
    under '@alias'.
    """
    accession = entity.get_accession(f'ENA_{self.ena_type}')
    if not accession:
        spec['@alias'] = ['', fixed_attribute, entity.identifier.index]
        return

    spec['@accession'] = ['', fixed_attribute, accession]
def update_links_in_submission(self, submission: Submission, study: Entity) -> dict:
    """Fetch the study's BioStudies submission and update its links section.

    The submission is retrieved by the study's 'BioStudies' accession, its
    links section is located and then updated from ``study`` and
    ``submission``. The retrieved submission (the ``json`` attribute of the
    response) is returned.
    """
    accession = study.get_accession('BioStudies')
    response = self.get_submission_by_accession(accession)
    fetched_submission = response.json

    self.__update_links_section(
        self.__get_links_section_from_submission(fetched_submission),
        study,
        submission,
    )
    return fetched_submission