def organism_part_sex_check(sample: Dict, animal: Dict, existing_results: VRR) -> VRR:
    """
    Context validation making sure the organism part is compatible with the sex of the animal
    the sample derives from: the organism part with ontology id UBERON_0001968 must not be
    taken from a female animal
    When the animal is annotated with an unknown sex, a Warning is raised instead of an Error
    :param sample: the sample record
    :param animal: the record of the animal the sample is derived from
    :param existing_results: the existing validation result
    :return: the updated validation result
    """
    sex: str = animal['attributes']['Sex'][0]['value']
    part_iri = sample['attributes']['Organism part'][0]['terms'][0]['url']
    if misc.extract_ontology_id_from_iri(part_iri) != 'UBERON_0001968':
        # only this organism part is restricted by the sex of the animal
        return existing_results

    sex_in_lower_case = sex.lower()
    if sex_in_lower_case == "female":
        status = VRConstant.ERROR
        message = "Organism part (S***n) could not be taken from a female animal"
    elif 'unknown sex' in sex_in_lower_case:
        # the third sex option is 'record of unknown sex'
        status = VRConstant.WARNING
        message = "Organism part (S***n) is expected to be taken from a male animal, " \
                  "please check the sex value (record of unknown sex) is correct"
    else:
        return existing_results

    existing_results.add_validation_result_column(
        VRC(status, message, existing_results.record_id, "organism part",
            VRConstant.CONTEXT))
    return existing_results
def validate(self, attributes: Dict, record_id: str, id_field: str) -> List[VRC]:
    """
    Validate one record against every field rule of this section
    The presence of all mandatory fields (except the id field) is checked first; when any of them
    is missing only those errors are returned and no field value is validated
    :param attributes: the record attribute values
    :param record_id: the id of the record
    :param id_field: the name of the id field
    :return: list of field validation results
    """
    outcome: List[VRC] = []

    # step 1: all mandatory fields must be there, details are not checked yet
    if 'mandatory' in self.rules:
        for name in self.rules['mandatory']:
            if name != id_field and name not in attributes:
                msg = f"Mandatory field {name} in {self.get_section_name()} section could not be found"
                outcome.append(VRC(VRConstants.ERROR, msg, record_id, name))
    if outcome:
        return outcome

    # step 2: check the values of the supplied fields for all required levels
    for level_rules in self.rules.values():
        for name, rule in level_rules.items():
            if name not in attributes:
                continue
            outcome.extend(
                rule.validate(attributes[name], self.get_section_name(), record_id))
    return outcome
def load_data(self, data_file: str, section: str = '') -> VRR:
    """
    Load the data which is to be validated from a JSON file and do the preliminary validation
    (USI structure and duplicate ids); if successful the data ready flag is set
    The outcome of the preliminary validation is returned to the caller
    :param data_file: the JSON file containing the data
    :param section: optional, the name of the section which contains the data
    :return: the validation result of the first check which failed, or a result without any
        column when the data is ready to be validated
    """
    self.data_ready_flag = False
    general_errors = VRR("general")
    try:
        # JSON text is UTF-8 encoded, so do not rely on the platform default encoding
        with open(data_file, encoding="utf-8") as infile:
            self.data = json.load(infile)
    except FileNotFoundError:
        msg = f"Could not find the file {data_file}"
        general_errors.add_validation_result_column(
            VRC(VRConstants.ERROR, msg, general_errors.record_id, "",
                VRConstants.GENERAL))
        return general_errors
    except (json.decoder.JSONDecodeError, UnicodeDecodeError):
        # a file which could not even be decoded is reported the same way as malformed JSON
        msg = f"The provided file {data_file} is not a valid JSON file."
        general_errors.add_validation_result_column(
            VRC(VRConstants.ERROR, msg, general_errors.record_id, "",
                VRConstants.GENERAL))
        return general_errors

    # narrow the data down to the requested section, when it exists
    if section and section in self.data:
        self.data = self.data[section]

    # check usi structure
    usi_check_result = validation.check_usi_structure(self.data)
    if usi_check_result.get_overall_status() != "Pass":
        return usi_check_result

    # check duplicate id
    msgs = validation.check_duplicates(self.data, self.id_field)
    if msgs:
        for msg in msgs:
            # classify the error as ruleset based error
            # as it is implicitly required that id field holds unique values
            general_errors.add_validation_result_column(
                VRC(VRConstants.ERROR, msg, general_errors.record_id,
                    self.id_field, VRConstants.RELATIONSHIP))
        return general_errors

    logger.info("All sample records have unique data source ids")
    self.data_ready_flag = True
    return general_errors
def context_validation(record: Dict, existing_results: VRR, related: List = None) -> VRR:
    """
    Carry out the validation which depends on context, i.e. the value in one field affects the
    allowed values in another field, or more than one record is involved
    :param record: the record data
    :param existing_results: the existing validation result
    :param related: list of the related records, either the parents (for an organism) or the
        animal a specimen derives from, could be an empty list
    :return: updated validation result
    """
    # checks which only need the record itself
    existing_results = coordinate_check(record['attributes'], existing_results)
    existing_results = species_check(record, existing_results)
    record_id = existing_results.record_id

    if not related:
        # no relationship, nothing more to check
        return existing_results

    related_count = len(related)
    if record['attributes']['Material'][0]['value'] == "organism":
        # an animal: the related records are its parents
        if related_count > 2:
            existing_results.add_validation_result_column(
                VRC(VRConstant.ERROR,
                    "Having more than 2 parents defined in sampleRelationships",
                    existing_results.record_id, "sampleRelationships",
                    VRConstant.CONTEXT))
            return existing_results
        existing_results = child_of_check(record, related, existing_results)
        if related_count == 2:
            existing_results = parents_sex_check(related, existing_results)
        return existing_results

    # a specimen: it must derive from exactly one animal
    if related_count == 1:
        return animal_sample_check(record, related[0], existing_results)
    existing_results.add_validation_result_column(
        VRC(VRConstant.ERROR, "Specimen can only derive from one animal",
            record_id, "sampleRelationships", VRConstant.CONTEXT))
    return existing_results
def species_breed_check(animal: Dict, existing_results: VRR) -> VRR:
    """
    Check whether the mapped breed (recommended field) belongs to the given species, either as a
    breed or as a crossbreed of that species
    When no mapped breed is provided a warning is given saying that no check
    has been carried out on the supplied breed (mandatory field)
    :param animal: the animal record to be validated
    :param existing_results: the existing validation result
    :return: the updated validation result
    """
    attrs = animal['attributes']
    species = attrs[SPECIES][0]['value']
    # root breed ontology term for the given species, the last part of its iri
    general_breed = use_ontology.get_general_breed_by_species(species)
    general_breed_term = general_breed['ontologyTerms'].rsplit("/", 1)[1]

    if 'Mapped breed' not in attrs:
        existing_results.add_validation_result_column(
            VRC(VRConstant.WARNING,
                f"No check has been carried out on whether "
                f"{attrs['Supplied breed'][0]['value']} is a {species} breed as no mapped breed provided",
                existing_results.record_id, "Supplied breed",
                VRConstant.CONTEXT))
        return existing_results

    mapped_breed = attrs['Mapped breed'][0]['terms'][0]['url']
    if static_parameters.ontology_library.has_parent(mapped_breed, general_breed_term):
        return existing_results

    # not a pure breed of the species, it could still be one of its crossbreeds
    general_crossbreed = use_ontology.get_general_breed_by_species(species, cross=True)
    general_crossbreed_term = general_crossbreed['ontologyTerms'].rsplit("/", 1)[1]
    if not static_parameters.ontology_library.has_parent(mapped_breed, general_crossbreed_term):
        existing_results.add_validation_result_column(
            VRC(VRConstant.ERROR,
                f"The mapped breed {mapped_breed} does not match the given species {species}",
                existing_results.record_id, "Mapped breed",
                VRConstant.CONTEXT))
    return existing_results
def validate(self, record: Dict, id_field: str = 'Data source ID') -> VRR:
    """
    Validate the record with the full ruleset
    Every section whose condition is met by the record is applied; the columns which are not
    covered by any applied section are reported as warnings
    :param record: the record data
    :param id_field: the name of the id field, in IMAGE ruleset it is Data source ID
    :return: the validation result
    """
    logger.debug(f"got record: {record}, id_field: {id_field}")
    attributes = record['attributes']
    record_id = attributes[id_field][0]['value']
    record_result = VRR(record_id)

    # start with all columns but the id, then strike out the ones mapped by the ruleset
    unmapped = attributes.copy()
    del unmapped[id_field]

    for section_name in self.get_all_section_names():
        logger.debug(f"Processing section_name: {section_name}")
        section_rule = self.get_section_by_name(section_name)
        if not section_rule.meet_condition(record):
            logger.debug("section_rule %s doesn't meet_condition" % section_name)
            continue

        logger.debug("Applying " + section_name + " ruleset to record " + record_id)
        for column in section_rule.validate(attributes, record_id, id_field):
            record_result.add_validation_result_column(column)
        for mapped_name in section_rule.get_rule_names():
            unmapped.pop(mapped_name, None)

    # the unmapped column check can only be done here, not in the section rule
    # validation, as all section rules need to be applied first
    if not unmapped:
        logger.debug("No unmapped columns left")
        return record_result

    logger.debug("found those unmapped keys: %s" % (unmapped.keys()))
    for key in unmapped:
        record_result.add_validation_result_column(
            VRC(VRConstants.WARNING,
                f"Column {key} could not be found in ruleset", record_id, key))
    return record_result
def parents_sex_check(related: List[Dict], existing_results: VRR) -> VRR:
    """
    Context validation to check that the two annotated parents are of different sex
    When at least one parent is annotated with an unknown sex the check could not be done
    and a Warning is raised instead
    :param related: the list of the two parent animals
    :param existing_results: the existing validation result
    :return: the updated validation result
    """
    first_sex: str = related[0]['attributes']['Sex'][0]['value']
    second_sex: str = related[1]['attributes']['Sex'][0]['value']

    if any("unknown sex" in sex.lower() for sex in (first_sex, second_sex)):
        existing_results.add_validation_result_column(
            VRC(VRConstant.WARNING,
                "At least one parent has unknown value for sex, thus could not be checked",
                existing_results.record_id, "parents sex", VRConstant.CONTEXT))
    elif first_sex == second_sex:
        existing_results.add_validation_result_column(
            VRC(VRConstant.ERROR, "Two parents could not have same sex",
                existing_results.record_id, "parents sex", VRConstant.CONTEXT))
    return existing_results
def coordinate_check(record: Dict, existing_results: VRR) -> VRR:
    """
    Context validation to check whether the value in the place field is consistent with
    the value in the corresponding accuracy field
    The place field is "Birth location" for an organism and "Collection place" otherwise:
    a place must be given unless the accuracy is "missing geographic information",
    and must not be given when it is
    :param record: the attributes of the record, keyed by field name
    :param existing_results: the existing validation result
    :return: the updated validation result
    :raise TypeError: when record is not a dict or existing_results is not a ValidationResultRecord
    """
    # isinstance rather than an exact type comparison, so subclasses (e.g. OrderedDict) are accepted
    if not isinstance(record, dict):
        raise TypeError("record needs to be a record represented as a Dict")
    if not isinstance(existing_results, VRR):
        raise TypeError(
            "The existing results parameter needs to be a ValidationResultRecord object"
        )

    if record['Material'][0]['value'] == "organism":
        place_field_name = "Birth location"
    else:
        place_field_name = "Collection place"
    place_accuracy_field_name = place_field_name + " accuracy"

    # both outcomes depend on whether the accuracy declares the location as missing
    location_declared_missing = \
        record[place_accuracy_field_name][0]['value'] == "missing geographic information"

    msg = None
    if place_field_name not in record:
        if not location_declared_missing:
            msg = f"No value provided for field {place_field_name} but value in field" \
                  f" {place_accuracy_field_name} is not missing geographic information"
    elif location_declared_missing:
        msg = f"Value {record[place_field_name][0]['value']} provided for field {place_field_name} " \
              f"but value in field {place_accuracy_field_name} is missing geographic information"

    if msg is not None:
        existing_results.add_validation_result_column(
            VRC(VRConstant.ERROR, msg, existing_results.record_id,
                place_field_name, VRConstant.CONTEXT))
    return existing_results
def species_check(record: Dict, existing_results: VRR) -> VRR:
    """
    Context validation to check whether the taxon id specified in the USI structure matches
    the ontology term used in the species field
    The taxon id has to be the complete trailing number of the term url
    :param record: the record data
    :param existing_results: the existing validation result
    :return: the updated validation result
    """
    taxon_id = record['taxonId']
    url = record['attributes'][SPECIES][0]['terms'][0]['url']

    taxon_text = str(taxon_id)
    matched = url.endswith(taxon_text)
    if matched:
        # a bare suffix test is not enough: taxonId 913 would match a term ending in _9913,
        # so the character in front of the suffix must not be another digit
        preceding = url[:len(url) - len(taxon_text)]
        matched = not preceding[-1:].isdigit()

    if not matched:
        existing_results.add_validation_result_column(
            VRC(VRConstant.ERROR,
                f"taxonId {taxon_id} does not match ontology term used in species {url}",
                existing_results.record_id, "taxonomy", VRConstant.CONTEXT))
    return existing_results
def check_value_equal(source: Dict, target: Dict, existing_results: VRR, field: str) -> VRR:
    """
    Context validation to check that two related records hold the same value in the given field
    In the error message the records are called child/parent when the source record is an
    organism, otherwise sample/related animal
    :param source: the record being validated
    :param target: the related record to compare to
    :param existing_results: the existing validation result
    :param field: the name of the field which must hold the same value in both records
    :return: the updated validation result
    """
    target_field_value = target['attributes'][field][0]['value']
    source_field_value = source['attributes'][field][0]['value']
    source_is_organism = source['attributes']['Material'][0]['value'] == 'organism'
    source_label, target_label = \
        ('child', 'parent') if source_is_organism else ('sample', 'related animal')

    if target_field_value == source_field_value:
        return existing_results

    existing_results.add_validation_result_column(
        VRC(VRConstant.ERROR,
            f"The {field} of {source_label} ({source_field_value}) does not "
            f"match to the {field} of {target_label} ({target_field_value})",
            existing_results.record_id, field, VRConstant.CONTEXT))
    return existing_results
def load_ruleset(self, ruleset_file: str) -> VRR:
    """
    Load the ruleset from the JSON file and check the integrity of the ruleset,
    if successful the ruleset ready flag is set
    :param ruleset_file: the JSON file containing the ruleset
    :return: the result holding the problem found while reading or checking the ruleset,
        or a result without any column when the ruleset is ready to be used
    """
    self.ruleset_pass_flag = False
    general_errors = VRR("general")

    try:
        self.ruleset = validation.read_in_ruleset(ruleset_file)
    except KeyError as missing:
        problem = VRC(VRConstants.ERROR, str(missing), general_errors.record_id, "",
                      VRConstants.GENERAL)
        general_errors.add_validation_result_column(problem)
        return general_errors

    integrity: VRR = validation.check_ruleset(self.ruleset)
    if integrity.get_overall_status() != "Pass":
        return integrity

    logger.info("Ruleset loaded")
    self.ruleset_pass_flag = True
    return general_errors
def validate(self, entries, section_name: str, record_id: str):
    """
    Validate values of one field against the ruleset for that field
    The checks are done in stages: cardinality of the field first, then for every entry
    units, allowed values, ontology terms and finally the value type. The method returns as
    soon as one stage has produced any result (error or warning), so later stages are only
    reached while the result list is still empty
    :param entries: field data, a list of entries each having 'value' and optionally 'units' and 'terms'
    :param section_name: the section the field belong to
    :param record_id: the id of the record
    :return: list of validation result represented as validation column result list
    """
    results: List[VRC] = []
    # appended to every message so the user knows which section the rule comes from
    section_info: str = " (" + section_name + " section)"
    mandatory = False
    if self.required == 'mandatory':
        mandatory = True
    has_error = False
    # check cardinality
    entry_size: int = len(entries)
    if entry_size == 0:
        if mandatory:
            msg = f"Mandatory field {self.name} has empty value"
            results.append(
                VRC(VRConstants.ERROR, msg + section_info, record_id,
                    self.name))
            has_error = True
        else:
            # an empty non-mandatory field is only worth a warning
            msg = f"{self.required} field {self.name} has empty value, better remove the field"
            results.append(
                VRC(VRConstants.WARNING, msg + section_info, record_id,
                    self.name))
    elif entry_size > 1:
        if not self.allow_multiple():
            msg = f"Multiple values supplied for field {self.name} which does not allow multiple values"
            results.append(
                VRC(VRConstants.ERROR, msg + section_info, record_id,
                    self.name))
            has_error = True
        # multiple only be True (reaching here) when existing Allow Multiple, no need to check existence
        if entry_size > 2 and self.get_multiple() == 'max 2':
            msg = f"Maximum of 2 values allowed for field {self.name} but {str(entry_size)} values provided"
            results.append(
                VRC(VRConstants.ERROR, msg + section_info, record_id,
                    self.name))
            has_error = True
    # the errors detected above mean that there is no need to validate the actual value(s)
    if has_error:
        return results

    for entry in entries:
        value = entry['value']
        # check units
        allowed_units = self.get_allowed_units()
        allowed_units_str = ', '.join(allowed_units)
        if 'units' in entry:
            if allowed_units:
                if entry['units'] not in allowed_units:
                    msg = f"{entry['units']} for field {self.name} is not " \
                          f"in the valid units list ({allowed_units_str})"
                    results.append(
                        VRC(VRConstants.ERROR, msg + section_info, record_id,
                            self.name))
            else:  # unit not required, but exists, raise a warning
                msg = f"No units required but {entry['units']} is used as unit for field {self.name}"
                results.append(
                    VRC(VRConstants.WARNING, msg + section_info, record_id,
                        self.name))
        else:
            if allowed_units:
                msg = f"One of {allowed_units_str} need to be present for the field {self.name}"
                results.append(
                    VRC(VRConstants.ERROR, msg + section_info, record_id,
                        self.name))

        # check allowed values
        allowed_values = self.get_allowed_values()
        if allowed_values:
            if value not in allowed_values:
                if self.name == "Availability":
                    # a value outside the allowed values list is still accepted for the
                    # Availability field as long as misc.is_url recognises it
                    if not misc.is_url(value):
                        msg = f'<{value}> of field Availability is neither "no longer available" nor a valid URI'
                        results.append(
                            VRC(VRConstants.ERROR, msg + section_info,
                                record_id, self.name))
                else:  # not availability
                    allowed_values_str = '>, <'.join(allowed_values)
                    msg = f"<{value}> of field {self.name} is not in the valid values list (<{allowed_values_str}>)"
                    results.append(
                        VRC(VRConstants.ERROR, msg + section_info, record_id,
                            self.name))

        # stop at the first entry which produced any result (including warnings) so far
        if results:
            return results

        if 'terms' in entry:
            if not self.get_allowed_terms():  # allowed conditions empty
                msg = f"Ontology provided for field {self.name} however there is no requirement in the ruleset"
                results.append(
                    VRC(VRConstants.WARNING, msg + section_info, record_id,
                        self.name))
            else:
                for term in entry['terms']:
                    iri = term['url']
                    if not misc.is_url(iri):
                        msg = f"Invalid URI value {iri} in field {self.name}"
                        results.append(
                            VRC(VRConstants.ERROR, msg + section_info,
                                record_id, self.name))
                        # no ontology id could be extracted from an invalid iri
                        continue

                    term_id = misc.extract_ontology_id_from_iri(iri)
                    if not self.check_ontology_allowed(term_id):
                        msg = f"Not valid ontology term {term_id} in field {self.name}"
                        results.append(
                            VRC(VRConstants.ERROR, msg + section_info,
                                record_id, self.name))
        if results:
            return results

        # check type
        # current allowed types:
        # numeric: number
        # textual: text, limited value, ontology_id, uri, doi, date
        # number type requires a unit, which is covered in the units check above
        if self.type == 'number':
            # exact type comparison: only int and float values are accepted here
            if type(value) is not float and type(value) is not int:
                msg = f"For field {self.name} the provided value {str(value)} is not represented " \
                      f"as/of the expected type Number"
                results.append(
                    VRC(VRConstants.ERROR, msg + section_info, record_id,
                        self.name))
        else:  # textual types
            if type(value) is not str:
                msg = f"For field {self.name} the provided value {str(value)} " \
                      f"is not of the expected type {self.type}"
                results.append(
                    VRC(VRConstants.ERROR, msg + section_info, record_id,
                        self.name))
                return results
            # the following tests are based on the value is a string, so need to return above
            if self.type == 'ontology_id':
                if 'terms' not in entry:
                    msg = f"No url found for the field {self.name} which has the type of ontology_id"
                    results.append(
                        VRC(VRConstants.ERROR, msg + section_info, record_id,
                            self.name))
                else:
                    for term in entry['terms']:
                        iri = term['url']
                        # NOTE: from here on term holds the ontology id, not the entry of the terms list
                        term = misc.extract_ontology_id_from_iri(iri)
                        ontology = static_parameters.ontology_library.get_ontology(
                            term)
                        if iri != ontology.get_iri():
                            msg = f"Provided iri {iri} does not match the iri " \
                                  f"retrieved from OLS in the field {self.name}"
                            results.append(
                                VRC(VRConstants.WARNING, msg + section_info,
                                    record_id, self.name))
                        if not ontology.label_match_ontology(value):
                            # second attempt with False as second argument; judging by the
                            # message below it ignores letter case - TODO confirm
                            if ontology.label_match_ontology(value, False):
                                msg = f"Provided value {value} has different letter case" \
                                      f" to the term referenced by {iri}"
                                results.append(
                                    VRC(VRConstants.WARNING,
                                        msg + section_info, record_id,
                                        self.name))
                            else:
                                msg = f"Provided value {value} does not match to the provided ontology {iri}"
                                results.append(
                                    VRC(VRConstants.ERROR, msg + section_info,
                                        record_id, self.name))
            elif self.type == "uri":
                url_result = misc.is_url(value)
                if not url_result:
                    msg = f"Invalid URI value {value} for field {self.name}"
                    results.append(
                        VRC(VRConstants.ERROR, msg + section_info, record_id,
                            self.name))
                else:  # is in URI
                    # in image ruleset, when email provided, it must begin with mailto:
                    if misc.is_email(value):
                        if misc.is_email(
                                value, True
                        ):  # the whole value of value is an email, which is wrong
                            msg = f'Email address must have prefix "mailto:" in the field {self.name}'
                            results.append(
                                VRC(VRConstants.ERROR, msg + section_info,
                                    record_id, self.name))
                    else:  # it is URL, but not email: could be a normal URL or wrong mailto: location
                        # find() > 0 means mailto: occurs somewhere after the first character
                        if value.find("mailto:") > 0:
                            msg = f"mailto must be at position 1 to be a valid email value in the field {self.name}"
                            results.append(
                                VRC(VRConstants.ERROR, msg + section_info,
                                    record_id, self.name))
            elif self.type == 'doi':
                doi_result = misc.is_doi(value)
                if not doi_result:
                    msg = f"Invalid DOI value supplied in the field {self.name}"
                    results.append(
                        VRC(VRConstants.ERROR, msg + section_info, record_id,
                            self.name))
            elif self.type == 'date':
                # there is always a format(unit) for the date type (checked in the validation.read_in_ruleset)
                # therefore entry[units] existence should have already been
                # if 'units' not in entry:
                date_format = entry['units']
                # a non-empty return value is used as the error message
                date_result = misc.get_matched_date(value, date_format)
                if date_result:
                    results.append(
                        VRC(VRConstants.ERROR, date_result + section_info,
                            record_id, self.name))

        # it would be safer to skip the validations below as unmatched type detected
        if results:
            return results

    return results
def validate(self) -> None:
    """
    Validate the data against the ruleset
    The data needs to be scanned twice: the first time to validate the individual fields of
    every record based on the ruleset, the second time to validate anything involving more
    than one record (relationships) or more than one field in the same record (context validation)
    Records which already have an error are skipped in the second scan
    The validation result is stored in the object's validation_results field, keyed by record alias
    Nothing is done when either the data or the ruleset has not been loaded successfully
    """
    # check whether ready to carry out validation
    if not self.data_ready_flag:
        logger.error(
            "The data is not ready, abort the validation proecess")
        return
    if not self.ruleset_pass_flag:
        logger.error(
            "The ruleset is not ready, abort the validation proecess")
        return

    # seconds to wait for BioSamples, without a timeout an unresponsive server blocks forever
    biosamples_timeout = 30

    # first scan
    # meanwhile split data according to material type for relationship checking
    data_by_material: Dict[str, Dict[str, Dict]] = {}
    for record in self.data:
        logger.info("Validate record " + record['alias'])
        record_result = self.ruleset.validate(record)
        self.validation_results[record['alias']] = record_result
        try:
            material = record['attributes']['Material'][0]['value'].lower()
            data_by_material.setdefault(material, {})[record['alias']] = record
        except KeyError:
            # error still exists, e.g. using material rather than Material, which needs to be caught
            # however the reporting should have already been done by validation
            pass

    # the data may contain no organism record at all, which must not break the lookup below
    organisms = data_by_material.get('organism', {})

    # second scan
    for record in self.data:
        record_result = self.validation_results[record['alias']]
        # if the record is with status Error, no more validation will be done
        if record_result.get_overall_status() == VRConstants.ERROR:
            continue
        record_id = record['attributes'][self.id_field][0]['value']

        # check relationship
        relationships = record.get('sampleRelationships', [])
        related: List[Dict] = []
        for relationship in relationships:
            if 'accession' in relationship:
                # target is biosample accession which is checked in validation.check_usi_structure
                target = relationship['accession']
                url = f"https://www.ebi.ac.uk/biosamples/samples/{target}"
                try:
                    response = requests.get(url, timeout=biosamples_timeout)
                    retrieved = response.status_code == 200
                except requests.exceptions.RequestException:
                    # network problems are reported like any other failed retrieval
                    retrieved = False
                if not retrieved:
                    record_result.add_validation_result_column(
                        VRC(VRConstants.WARNING,
                            f"Fail to retrieve record {target} from "
                            f"BioSamples as required in the relationship",
                            record_id, 'sampleRelationships',
                            VRConstants.GENERAL))
                # when retrieved nothing more is done: at the moment there is no IMAGE data
                # in BioSamples, otherwise check project = IMAGE and parse into memory
            else:
                # in the current ruleset, derived from only from organism to specimen,
                # so safe to only check organism
                target: str = relationship['alias']
                if target in organisms:
                    related.append(dict(organisms[target]))
                else:
                    record_result.add_validation_result_column(
                        VRC(VRConstants.ERROR,
                            f"Could not locate the referenced record {target}",
                            record_id, 'sampleRelationships',
                            VRConstants.RELATIONSHIP))
        self.validation_results[record['alias']] = record_result

        # if error found during relationship checking, skip context validation
        # because some context validation (relationship check etc) could not be carried out
        if record_result.get_overall_status() == VRConstants.ERROR:
            continue

        record_result = validation.context_validation(record, record_result, related)

        # a record without any finding gets an explicit Pass column
        if record_result.is_empty():
            record_result.add_validation_result_column(
                VRC("Pass", "", record_result.record_id, "", VRConstants.EMPTY))
        self.validation_results[record['alias']] = record_result
def check_ruleset(ruleset: Ruleset.RuleSet) -> VRR:
    """
    Validate that the ruleset itself is a valid ruleset, i.e. for every rule the presence of
    allowed values, units and ontology terms is consistent with the type of the rule
    :param ruleset: the ruleset to be validated
    :return: the validation result holding one column per problem found,
        if the ruleset is valid no column is added
    :raise TypeError: when the parameter is not a RuleSet object
    """
    if type(ruleset) is not Ruleset.RuleSet:
        raise TypeError("The parameter must be of a RuleSet object")

    results: VRR = VRR(RULESET_CHECK_ID)

    def report(status, msg, field) -> None:
        # all findings are attached to the ruleset check pseudo record
        results.add_validation_result_column(
            VRC(status, msg, RULESET_CHECK_ID, field, VRConstant.RULESET_CHECK))

    for section_name in ruleset.get_all_section_names():
        section_rule: Ruleset.RuleSection = ruleset.get_section_by_name(section_name)
        for rules_of_level in section_rule.get_rules().values():
            for rule_name, rule in rules_of_level.items():
                rule_type = rule.get_type()

                # allowed values: required by limited value, forbidden for ontology_id and text
                if rule.get_allowed_values():
                    if rule_type in ("ontology_id", "text"):
                        msg = f"No valid values should be provided to field " \
                              f"{rule.get_name()} as being of {rule_type} type"
                        report(VRConstant.ERROR, msg, rule_name)
                elif rule_type == "limited value":
                    msg = f"There is no allowed values for field {rule.get_name()} being of {rule_type} type"
                    report(VRConstant.ERROR, msg, rule_name)

                # units: required by number and date, forbidden for all other types
                expects_units = rule_type in ("number", "date")
                if rule.get_allowed_units():
                    if not expects_units:
                        msg = f"Valid units provided for field {rule.get_name()} " \
                              f"having type as {rule_type} which does not expect units"
                        report(VRConstant.ERROR, msg, rule_name)
                elif expects_units:
                    msg = f"Field {rule.get_name()} has type as {rule_type} but no valid units provided"
                    report(VRConstant.ERROR, msg, rule_name)

                # ontology terms: required by ontology_id, only suspicious for other types
                if rule.get_allowed_terms():
                    if rule_type != "ontology_id":
                        msg = f"Ontology terms are provided for field {rule.get_name()}. " \
                              f"Please re-consider whether it needs to change to ontology_id type."
                        report(VRConstant.WARNING, msg, rule_name)
                elif rule_type == "ontology_id":
                    msg = f"No valid terms provided to field {rule.get_name()} " \
                          f"which is essential to be of ontology_id type"
                    report(VRConstant.ERROR, msg, rule_name)
    return results
def check_usi_structure(sample: List[Dict]) -> VRR:
    """
    Check whether the record values are represented in the format which can be submitted via USI
    This also guarantees that the following validation does not need to worry about how the data
    is represented
    This function also checks whether more than one record use the same alias (USI requirement)
    :param sample: the records represented in JSON
    :return: the validation result holding one error column per structural problem found,
        no column is added when the data meets the USI format
    """
    logger.debug("Check whether data meets USI data format standard")
    count: Dict[str, int] = {}
    result: VRR = VRR(USI_CHECK_ID)
    error_prefix = 'Wrong JSON structure:'

    def report(message: str) -> None:
        # every finding of this check is an error attached to the USI check pseudo record
        result.add_validation_result_column(
            VRC(VRConstant.ERROR, message, USI_CHECK_ID, "", VRConstant.USI_CHECK))

    if type(sample) is not list:
        report(f"{error_prefix} all data need to be encapsulated in an array")
        return result

    for one in sample:
        # check the structure, if wrong, could not continue, so directly skip to next record
        # rather than setting error flag
        if type(one) is not dict:
            report(f"{error_prefix} some records are not represented as hashes")
            continue
        if 'alias' not in one:
            report(f"{error_prefix} some records do not have alias which is "
                   f"mandatory and used to identify record")
            continue
        alias = one['alias']
        if type(alias) is list or type(alias) is dict:
            report(f"{error_prefix} alias can only be a string")
            continue
        count[alias] = count.get(alias, 0) + 1

        # check existence of mandatory fields
        error_flag = False
        for field, description in (('title', 'title field'),
                                   ('releaseDate', 'releaseDate field'),
                                   ('taxonId', 'taxonId field'),
                                   ('attributes', 'attributes')):
            if field not in one:
                report(f"{error_prefix} no {description} for record with alias as {alias}")
                error_flag = True
        # the values could not be checked when any mandatory field is missing
        if error_flag:
            continue

        # check value of mandatory fields except type of alias
        # which is checked above and duplicate check outside this loop
        # taxonId must be an integer
        if not isinstance(one['taxonId'], int):
            report(f"{error_prefix} taxonId value for record {alias} is not an integer")
        # releaseDate needs to be in YYYY-MM-DD
        date_check = misc.get_matched_date(one['releaseDate'], "YYYY-MM-DD")
        if date_check:
            report(f"{error_prefix} {date_check} for record with alias value {alias}")

        # attributes is a map from attribute name to a list of values, each value being a map
        attrs = one['attributes']
        if type(attrs) is not dict:
            report(f"{error_prefix} attributes must be stored as a map for record with alias "
                   f"{alias}")
        else:
            for attr_name in attrs:
                attr_values = attrs[attr_name]
                if type(attr_values) is not list:
                    report(f"{error_prefix} the values for attribute {attr_name} "
                           f"needs to be in an array for record {alias}")
                    continue
                for attr_value in attr_values:
                    if type(attr_value) is not dict:
                        report(f"{error_prefix} the attribute value of {attr_name} needs to be "
                               f"represented as a map in record {alias}")
                        continue
                    if 'value' not in attr_value:
                        report(f"{error_prefix} could not find 'value' keyword for attribute"
                               f" {attr_name} in record {alias}")
                        continue
                    for key in attr_value.keys():
                        if key != 'value' and key != 'units' and key != 'terms':
                            report(f"{error_prefix} Unrecognized keyword {key} used in "
                                   f"attribute {attr_name} in record {alias}")
                        elif key == 'terms':
                            terms_value = attr_value[key]
                            if type(terms_value) is not list:
                                report(f"{error_prefix} ontology terms need to be stored "
                                       f"in an array in record {alias}")
                            elif not terms_value:
                                # later validation reads the first term directly, so an
                                # empty array has to be rejected here instead of crashing
                                report(f"{error_prefix} ontology terms array is empty for "
                                       f"attribute {attr_name} in record {alias}")
                            elif any(type(term) is not dict or 'url' not in term
                                     for term in terms_value):
                                # every term is inspected, not only the first one
                                report(f"{error_prefix} url not used as key for ontology term "
                                       f"in record {alias}")

        # optional field
        # target -> relationship nature, used to detect duplicated relationships
        existing_relationships: Dict[str, str] = dict()
        # the first nature found which is neither 'same as' nor 'recurated from'
        existing_keyword: str = ''
        if 'sampleRelationships' not in one:
            continue
        relationships = one['sampleRelationships']
        if type(relationships) is not list:
            report(f"{error_prefix} sampleRelationships field must have values within an array "
                   f"for record with alias {alias}")
            continue
        for relationship in relationships:
            if type(relationship) is not dict:
                report(f"{error_prefix} relationship needs to be presented as a hash "
                       f"for record with alias {alias}")
                continue
            # relationship must have two and only two elements
            if len(relationship.keys()) != 2:
                report(f"{error_prefix} two and only two keys (alias/accession and relationshipNature) "
                       f"must be presented within every relationship. Affected record {alias}")
                continue
            if not (('alias' in relationship or 'accession' in relationship)
                    and 'relationshipNature' in relationship):
                report(f"{error_prefix} Unrecognized key used (only can be alias/accession and "
                       f"relationshipNature) within one relationship. Affected record {alias}")
                continue

            relationship_nature = relationship['relationshipNature']
            if relationship_nature not in ALLOWED_RELATIONSHIP_NATURE:
                report(f"{error_prefix} Unrecognized relationship nature {relationship_nature} "
                       f"within record {alias}")
                continue

            # apart from 'same as' and 'recurated from' only one kind of nature is allowed per record
            if relationship_nature != 'same as' and relationship_nature != 'recurated from' \
                    and existing_keyword != relationship_nature \
                    and existing_keyword != 'same as' \
                    and existing_keyword != 'recurated from':
                if len(existing_keyword) == 0:
                    existing_keyword = relationship_nature
                else:
                    report(f"{error_prefix} More than one relationship natures found "
                           f"within record {alias}")

            # alias refers to a record within the data, accession to a BioSamples record
            if 'alias' in relationship:
                target = relationship['alias']
                if misc.is_biosample_record(target):
                    report(f"{error_prefix} In relationship alias can only "
                           f"take non-BioSamples accession, not {target}")
            else:
                target = relationship['accession']
                if not misc.is_biosample_record(target):
                    report(f"{error_prefix} In relationship accession can only "
                           f"take BioSamples accession, not {target}")

            if target in existing_relationships:  # already found this in
                report(f"Duplicated relationship {relationship_nature} with {target}"
                       f" for record {alias}")
            existing_relationships[target] = relationship['relationshipNature']

    # USI requires the alias to be unique within the data
    for key, occurrences in count.items():
        if occurrences > 1:
            report(f"There are more than one record having {key} as its alias")
    return result