def organism_part_sex_check(sample: Dict, animal: Dict,
                            existing_results: VRR) -> VRR:
    """
    Context validation to check that the organism part of a sample is compatible with
    the sex of the animal it derives from, i.e. semen can only be taken from a male animal

    Only semen (UBERON_0001968) is checked: an Error is added when the animal is female,
    a Warning is added when the animal is annotated with unknown sex.
    :param sample: the sample record
    :param animal: the record of the animal the sample is derived from
    :param existing_results: the existing validation result
    :return: the updated validation result (the same object as existing_results)
    """
    # the sex is read first so that a missing Sex attribute fails for every organism part
    sex: str = animal['attributes']['Sex'][0]['value'].lower()
    organism_part_ontology = misc.extract_ontology_id_from_iri(
        sample['attributes']['Organism part'][0]['terms'][0]['url'])
    if organism_part_ontology != 'UBERON_0001968':  # only semen is sex specific
        return existing_results

    if sex == "female":
        existing_results.add_validation_result_column(
            VRC(
                VRConstant.ERROR,
                "Organism part (Semen) could not be taken from a female animal",
                existing_results.record_id, "organism part",
                VRConstant.CONTEXT))
    # the third sex option is 'record of unknown sex'
    elif 'unknown sex' in sex:
        existing_results.add_validation_result_column(
            VRC(
                VRConstant.WARNING,
                "Organism part (Semen) is expected to be taken from a male animal, "
                "please check the sex value (record of unknown sex) is correct",
                existing_results.record_id, "organism part",
                VRConstant.CONTEXT))
    return existing_results
示例#2
0
    def validate(self, attributes: Dict, record_id: str,
                 id_field: str) -> List[VRC]:
        """
        Apply every field rule of this section to the attributes of one record

        The presence of all mandatory fields (the id field excepted) is checked first;
        when any of them is missing only those errors are returned and no value is validated.
        :param attributes: the record attribute values
        :param record_id: the id of the record
        :param id_field: the name of the id field
        :return: list of field validation results
        """
        findings: List[VRC] = []
        # step 1: presence of the mandatory fields only, their values are checked in step 2
        if 'mandatory' in self.rules:
            for name in self.rules['mandatory']:
                if name == id_field or name in attributes:
                    continue
                msg = f"Mandatory field {name} in {self.get_section_name()} section could not be found"
                findings.append(VRC(VRConstants.ERROR, msg, record_id, name))
            if findings:
                return findings
        # step 2: validate the value of every supplied field, whatever its required level
        for level_rules in self.rules.values():
            for name, field_rule in level_rules.items():
                if name not in attributes:
                    continue
                findings.extend(
                    field_rule.validate(attributes[name],
                                        self.get_section_name(), record_id))

        return findings
示例#3
0
 def load_data(self, data_file: str, section: str = '') -> VRR:
     """
     Load the data from JSON file which is to be validated and
     do preliminary validation (usi structure and duplicate), if successful set data ready flag
     :param data_file: the JSON file contains the data
     :param section: optional, the name of the section which contains data,
         ignored when the loaded data has no such section
     :return: the preliminary validation result: the result of the usi structure check when
         that check does not pass, otherwise the "general" result holding the file or duplicate
         id errors (without any error when the data is ready)
     """
     self.data_ready_flag = False
     general_errors = VRR("general")
     try:
         # JSON text is UTF-8 encoded (RFC 8259), so do not depend on the locale default encoding
         with open(data_file, encoding="utf-8") as infile:
             self.data = json.load(infile)
     except FileNotFoundError:
         msg = f"Could not find the file {data_file}"
         general_errors.add_validation_result_column(
             VRC(VRConstants.ERROR, msg, general_errors.record_id, "",
                 VRConstants.GENERAL))
         return general_errors
     except (json.decoder.JSONDecodeError, UnicodeDecodeError):
         # a file which cannot even be decoded as text is reported like any other malformed JSON
         msg = f"The provided file {data_file} is not a valid JSON file."
         general_errors.add_validation_result_column(
             VRC(VRConstants.ERROR, msg, general_errors.record_id, "",
                 VRConstants.GENERAL))
         return general_errors
     if section and section in self.data:
         self.data = self.data[section]
     # check usi structure
     usi_check_result = validation.check_usi_structure(self.data)
     if usi_check_result.get_overall_status() != "Pass":
         return usi_check_result
     # check duplicate id
     msgs = validation.check_duplicates(self.data, self.id_field)
     if msgs:
         for msg in msgs:
             # classify the error as ruleset based error
             # as it is implicitly required that id field holds unique values
             general_errors.add_validation_result_column(
                 VRC(VRConstants.ERROR, msg, general_errors.record_id,
                     self.id_field, VRConstants.RELATIONSHIP))
         return general_errors
     logger.info("All sample records have unique data source ids")
     self.data_ready_flag = True
     return general_errors
def context_validation(record: Dict,
                       existing_results: VRR,
                       related: List = None) -> VRR:
    """
    Validation based on context, i.e. checks where the value of one field restricts the
    values allowed in another field, or which involve more than one record

    The coordinate and species checks are always carried out. The relationship based checks
    are only carried out when related records are given: an organism may have up to 2 parents,
    any other material must derive from exactly one animal.
    :param record: the record data
    :param existing_results: the existing validation result
    :param related: list of the related records either parents or related animal, could be empty list
    :return: updated validation result
    """
    existing_results = coordinate_check(record['attributes'], existing_results)
    existing_results = species_check(record, existing_results)
    record_id = existing_results.record_id
    # nothing more to check without related records, i.e. no relationships
    if not related:
        return existing_results

    related_count = len(related)
    if record['attributes']['Material'][0]['value'] == "organism":
        # animal record: the related records are its parents
        if related_count > 2:
            existing_results.add_validation_result_column(
                VRC(
                    VRConstant.ERROR,
                    "Having more than 2 parents defined in sampleRelationships",
                    existing_results.record_id, "sampleRelationships",
                    VRConstant.CONTEXT))
            return existing_results
        existing_results = child_of_check(record, related, existing_results)
        if related_count == 2:
            existing_results = parents_sex_check(related, existing_results)
        return existing_results

    # specimen record: the only related record is the animal it derives from
    if related_count == 1:
        return animal_sample_check(record, related[0], existing_results)
    existing_results.add_validation_result_column(
        VRC(VRConstant.ERROR, "Specimen can only derive from one animal",
            record_id, "sampleRelationships", VRConstant.CONTEXT))
    return existing_results
def species_breed_check(animal: Dict, existing_results: VRR) -> VRR:
    """
    Context validation to check whether the mapped breed (recommended) belongs to the species

    The mapped breed must be a child of either the general breed term or the general
    crossbreed term of the species, otherwise an Error is added. When no mapped breed is
    provided, a Warning is added saying that the supplied breed (mandatory) has not been checked.
    :param animal: the animal record to be validated
    :param existing_results: the existing validation result
    :return: the updated validation result
    """
    attrs = animal['attributes']
    species = attrs[SPECIES][0]['value']
    # root breed ontology term of the given species, the id is the last part of the term url
    pure_breed_root = use_ontology.get_general_breed_by_species(species)
    pure_breed_term = pure_breed_root['ontologyTerms'].rsplit("/", 1)[1]

    if 'Mapped breed' not in attrs:
        existing_results.add_validation_result_column(
            VRC(
                VRConstant.WARNING,
                f"No check has been carried out on whether "
                f"{attrs['Supplied breed'][0]['value']} is a {species} breed as no mapped breed provided",
                existing_results.record_id, "Supplied breed",
                VRConstant.CONTEXT))
        return existing_results

    mapped_breed = attrs['Mapped breed'][0]['terms'][0]['url']
    library = static_parameters.ontology_library
    if library.has_parent(mapped_breed, pure_breed_term):
        return existing_results

    # not a pure breed of the species, it may still be one of its crossbreeds
    cross_breed_root = use_ontology.get_general_breed_by_species(species,
                                                                 cross=True)
    cross_breed_term = cross_breed_root['ontologyTerms'].rsplit("/", 1)[1]
    if not library.has_parent(mapped_breed, cross_breed_term):
        existing_results.add_validation_result_column(
            VRC(
                VRConstant.ERROR,
                f"The mapped breed {mapped_breed} does not match the given species {species}",
                existing_results.record_id, "Mapped breed",
                VRConstant.CONTEXT))
    return existing_results
示例#6
0
    def validate(self, record: Dict, id_field: str = 'Data source ID') -> VRR:
        """
        Validate the record with the full ruleset

        Every section whose condition is met by the record validates the attributes.
        Attributes which are not covered by any of the applied sections are reported
        with a Warning each.
        :param record: the record data
        :param id_field: the name of the id field, in IMAGE ruleset it is Data source ID
        :return: the validation result
        """
        # lazy %-style arguments are used for all log messages in this method: string
        # concatenation would raise a TypeError when the id value is not a string,
        # before the field rules get the chance to report the wrong type
        logger.debug("got record: %s, id_field: %s", record, id_field)
        attributes = record['attributes']
        record_id = attributes[id_field][0]['value']
        record_result = VRR(record_id)

        # columns not (yet) mapped to the ruleset, the id field is never reported as unmapped
        unmapped = {
            name: value
            for name, value in attributes.items() if name != id_field
        }
        for section_name in self.get_all_section_names():
            logger.debug("Processing section_name: %s", section_name)
            section_rule = self.get_section_by_name(section_name)
            if not section_rule.meet_condition(record):
                logger.debug("section_rule %s doesn't meet_condition",
                             section_name)
                continue

            logger.debug("Applying %s ruleset to record %s", section_name,
                         record_id)
            for one in section_rule.validate(attributes, record_id, id_field):
                record_result.add_validation_result_column(one)

            for field_name in section_rule.get_rule_names():
                unmapped.pop(field_name, None)

        # unmapped column check can only be done here, not in section rule
        # validation as all section rules need to apply
        if unmapped:
            logger.debug("found those unmapped keys: %s", unmapped.keys())

            for key in unmapped:
                record_result.add_validation_result_column(
                    VRC(VRConstants.WARNING,
                        f"Column {key} could not be found in ruleset",
                        record_id, key))
        else:
            logger.debug("No unmapped columns left")

        return record_result
def parents_sex_check(related: List[Dict], existing_results: VRR) -> VRR:
    """
    Context validation to check that the two annotated parents are of different sexes

    When at least one parent is annotated with unknown sex the comparison is not possible
    and a Warning is added instead, otherwise an Error is added if both values are equal.
    :param related: the list of two parent animals
    :param existing_results: the existing validation result
    :return: the updated validation result
    """
    first_sex, second_sex = (parent['attributes']['Sex'][0]['value']
                             for parent in related[:2])
    if any("unknown sex" in sex.lower() for sex in (first_sex, second_sex)):
        existing_results.add_validation_result_column(
            VRC(
                VRConstant.WARNING,
                "At least one parent has unknown value for sex, thus could not be checked",
                existing_results.record_id, "parents sex", VRConstant.CONTEXT))
    elif first_sex == second_sex:
        existing_results.add_validation_result_column(
            VRC(VRConstant.ERROR, "Two parents could not have same sex",
                existing_results.record_id, "parents sex", VRConstant.CONTEXT))
    return existing_results
def coordinate_check(record: Dict, existing_results: VRR) -> VRR:
    """
    Context validation to check whether the value in the place field is consistent with
    the value in the corresponding accuracy field

    The place field is "Birth location" for organism and "Collection place" for any other
    material. A place must be given unless the accuracy is "missing geographic information",
    and must not be given when it is.
    :param record: the attributes of the record, i.e. the mapping which directly contains
        the Material, place and place accuracy fields (not the whole USI record)
    :param existing_results: the existing validation result
    :return: the updated validation result
    :raises TypeError: when record is not a dict or existing_results is not a ValidationResultRecord
    """
    # isinstance rather than exact type comparison so that subclasses are accepted as well
    if not isinstance(record, dict):
        raise TypeError("record needs to be a record represented as a Dict")
    if not isinstance(existing_results, VRR):
        raise TypeError(
            "The existing results parameter needs to be a ValidationResultRecord object"
        )
    material = record['Material'][0]['value']
    place_field_name = "Birth location" if material == "organism" else "Collection place"
    place_accuracy_field_name = place_field_name + " accuracy"

    place_given = place_field_name in record
    accuracy_is_missing = record[place_accuracy_field_name][0][
        'value'] == "missing geographic information"
    msg = None
    if place_given and accuracy_is_missing:
        msg = f"Value {record[place_field_name][0]['value']} provided for field {place_field_name} " \
            f"but value in field {place_accuracy_field_name} is missing geographic information"
    elif not place_given and not accuracy_is_missing:
        msg = f"No value provided for field {place_field_name} but value in field" \
            f" {place_accuracy_field_name} is not missing geographic information"
    if msg is not None:
        existing_results.add_validation_result_column(
            VRC(VRConstant.ERROR, msg, existing_results.record_id,
                place_field_name, VRConstant.CONTEXT))
    return existing_results
def species_check(record: Dict, existing_results: VRR) -> VRR:
    """
    Context validation to check that the species specified in the USI structure (taxonId)
    matches the ontology term used in the species field

    The taxonId must be the complete number at the end of the term url: taxonId 9913 matches
    a url ending with NCBITaxon_9913, whereas taxonId 913 does not although it is a suffix of it.
    :param record: the record data
    :param existing_results: the existing validation result
    :return: the updated validation result
    """
    taxon_id = record['taxonId']
    url = record['attributes'][SPECIES][0]['terms'][0]['url']
    taxon_str = str(taxon_id)
    matched = False
    if taxon_str and url.endswith(taxon_str):
        # a digit right before the suffix means the url holds a longer, different taxonomy id
        preceding = url[:len(url) - len(taxon_str)][-1:]
        matched = not preceding.isdigit()
    if not matched:
        existing_results.add_validation_result_column(
            VRC(
                VRConstant.ERROR,
                f"taxonId {taxon_id} does not match ontology term used in species {url}",
                existing_results.record_id, "taxonomy", VRConstant.CONTEXT))
    return existing_results
示例#10
0
def check_value_equal(source: Dict, target: Dict, existing_results: VRR,
                      field: str) -> VRR:
    """
    Context validation to check that two related records have the same value in the given field

    The labels used in the error message depend on the material of the source record:
    child/parent when the source is an organism, sample/related animal otherwise.
    :param source: the record being validated
    :param target: the related record the source is compared to
    :param existing_results: the existing validation result
    :param field: the name of the field which is expected to hold the same value in both records
    :return: the updated validation result
    """
    value_in_target = target['attributes'][field][0]['value']
    value_in_source = source['attributes'][field][0]['value']
    if source['attributes']['Material'][0]['value'] == 'organism':
        source_label, target_label = 'child', 'parent'
    else:
        source_label, target_label = 'sample', 'related animal'

    if value_in_target == value_in_source:
        return existing_results

    existing_results.add_validation_result_column(
        VRC(
            VRConstant.ERROR,
            f"The {field} of {source_label} ({value_in_source}) does not "
            f"match to the {field} of {target_label} ({value_in_target})",
            existing_results.record_id, field, VRConstant.CONTEXT))
    return existing_results
示例#11
0
 def load_ruleset(self, ruleset_file: str) -> VRR:
     """
     Read the ruleset from the JSON file and check its integrity,
     the ruleset ready flag is only set when both steps succeed
     :param ruleset_file: the JSON file containing the ruleset
     :return: the "general" result holding the error raised while reading the ruleset
         (empty on success), or the result of the ruleset integrity check when it does not pass
     """
     self.ruleset_pass_flag = False
     loading_result = VRR("general")
     try:
         self.ruleset = validation.read_in_ruleset(ruleset_file)
     except KeyError as key_error:
         loading_result.add_validation_result_column(
             VRC(VRConstants.ERROR, str(key_error),
                 loading_result.record_id, "", VRConstants.GENERAL))
         return loading_result

     integrity_result: VRR = validation.check_ruleset(self.ruleset)
     if integrity_result.get_overall_status() == "Pass":
         logger.info("Ruleset loaded")
         self.ruleset_pass_flag = True
         return loading_result
     return integrity_result
示例#12
0
    def validate(self, entries, section_name: str, record_id: str):
        """
        Validate values of one field against the ruleset for that field

        The checks are done in stages: cardinality of the entries first, then for every entry
        units and allowed values, ontology terms and finally the type of the value.
        A cardinality error stops the validation. Within the entries, the method returns after
        a stage as soon as any result (Error or Warning) has been collected, so the later stages
        and the remaining entries are not validated.
        :param entries: field data, the collection of entries of the field; each entry is read
            through its 'value' key and its optional 'units' and 'terms' keys
        :param section_name: the section the field belong to, it is appended to every message
        :param record_id: the id of the record
        :return: list of validation result represented as validation column result list
        """
        results: List[VRC] = []
        # suffix added to every message to tell which section the field rule comes from
        section_info: str = " (" + section_name + " section)"
        mandatory = False
        if self.required == 'mandatory':
            mandatory = True

        has_error = False
        # check cardinality
        entry_size: int = len(entries)
        if entry_size == 0:
            if mandatory:
                msg = f"Mandatory field {self.name} has empty value"
                results.append(
                    VRC(VRConstants.ERROR, msg + section_info, record_id,
                        self.name))
                has_error = True
            else:
                # only a warning, has_error stays False; with no entry the loop below does nothing
                msg = f"{self.required} field {self.name} has empty value, better remove the field"
                results.append(
                    VRC(VRConstants.WARNING, msg + section_info, record_id,
                        self.name))
        elif entry_size > 1:
            if not self.allow_multiple():
                msg = f"Multiple values supplied for field {self.name} which does not allow multiple values"
                results.append(
                    VRC(VRConstants.ERROR, msg + section_info, record_id,
                        self.name))
                has_error = True
            # the 'max 2' restriction is only checked when more than 2 values are supplied
            # NOTE(review): the original comment states that get_multiple() is safe to call here
            # because multiple values imply Allow Multiple exists - confirm against the rule class
            if entry_size > 2 and self.get_multiple() == 'max 2':
                msg = f"Maximum of 2 values allowed for field {self.name} but {str(entry_size)} values provided"
                results.append(
                    VRC(VRConstants.ERROR, msg + section_info, record_id,
                        self.name))
                has_error = True
        # the errors detected above mean that there is no need to validate the actual value(s)
        if has_error:
            return results

        for entry in entries:
            value = entry['value']
            # check units
            allowed_units = self.get_allowed_units()
            allowed_units_str = ', '.join(allowed_units)
            if 'units' in entry:
                if allowed_units:
                    if entry['units'] not in allowed_units:
                        msg = f"{entry['units']} for field {self.name} is not " \
                            f"in the valid units list ({allowed_units_str})"
                        results.append(
                            VRC(VRConstants.ERROR, msg + section_info,
                                record_id, self.name))
                else:  # unit not required, but exists, raise a warning
                    msg = f"No units required but {entry['units']} is used as unit for field {self.name}"
                    results.append(
                        VRC(VRConstants.WARNING, msg + section_info, record_id,
                            self.name))
            else:
                # units are expected by the rule but the entry has none
                if allowed_units:
                    msg = f"One of {allowed_units_str} need to be present for the field {self.name}"
                    results.append(
                        VRC(VRConstants.ERROR, msg + section_info, record_id,
                            self.name))
            # check allowed values
            allowed_values = self.get_allowed_values()
            if allowed_values:
                if value not in allowed_values:
                    if self.name == "Availability":
                        # Availability is special: a value which is not in the allowed values list
                        # (e.g. not "no longer available") is still accepted when it is a valid URI
                        if not misc.is_url(value):
                            msg = f'<{value}> of field Availability is neither "no longer available" nor a valid URI'
                            results.append(
                                VRC(VRConstants.ERROR, msg + section_info,
                                    record_id, self.name))
                    else:  # not availability
                        allowed_values_str = '>, <'.join(allowed_values)
                        msg = f"<{value}> of field {self.name} is not in the valid values list (<{allowed_values_str}>)"
                        results.append(
                            VRC(VRConstants.ERROR, msg + section_info,
                                record_id, self.name))
            # stop after the units / allowed values stage when anything has been reported
            if results:
                return results

            # check the ontology terms attached to the entry
            if 'terms' in entry:
                if not self.get_allowed_terms():  # allowed conditions empty
                    msg = f"Ontology provided for field {self.name} however there is no requirement in the ruleset"
                    results.append(
                        VRC(VRConstants.WARNING, msg + section_info, record_id,
                            self.name))
                else:
                    for term in entry['terms']:
                        iri = term['url']
                        if not misc.is_url(iri):
                            msg = f"Invalid URI value {iri} in field {self.name}"
                            results.append(
                                VRC(VRConstants.ERROR, msg + section_info,
                                    record_id, self.name))
                            # no ontology id can be extracted from an invalid URI, go to the next term
                            continue

                        term_id = misc.extract_ontology_id_from_iri(iri)
                        if not self.check_ontology_allowed(term_id):
                            msg = f"Not valid ontology term {term_id} in field {self.name}"
                            results.append(
                                VRC(VRConstants.ERROR, msg + section_info,
                                    record_id, self.name))
            # stop after the ontology terms stage when anything has been reported
            if results:
                return results

            # check type
            # current allowed types:
            # numeric: number
            # textual: text, limited value, ontology_id, uri, doi, date
            # number type requires a unit, which is covered in the units check above
            if self.type == 'number':
                # exact type comparison, so any other type (bool included) is reported
                if type(value) is not float and type(value) is not int:
                    msg = f"For field {self.name} the provided value {str(value)} is not represented " \
                        f"as/of the expected type Number"
                    results.append(
                        VRC(VRConstants.ERROR, msg + section_info, record_id,
                            self.name))
            else:  # textual types
                if type(value) is not str:
                    msg = f"For field {self.name} the provided value {str(value)} " \
                        f"is not of the expected type {self.type}"
                    results.append(
                        VRC(VRConstants.ERROR, msg + section_info, record_id,
                            self.name))
                    return results
                # the following tests are based on the value is a string, so need to return above
                if self.type == 'ontology_id':
                    if 'terms' not in entry:
                        msg = f"No url found for the field {self.name} which has the type of ontology_id"
                        results.append(
                            VRC(VRConstants.ERROR, msg + section_info,
                                record_id, self.name))
                    else:
                        for term in entry['terms']:
                            iri = term['url']
                            # note: from here on 'term' holds the ontology id, not the term entry
                            term = misc.extract_ontology_id_from_iri(iri)
                            ontology = static_parameters.ontology_library.get_ontology(
                                term)
                            # a different iri for the same ontology id is only a warning
                            if iri != ontology.get_iri():
                                msg = f"Provided iri {iri} does not match the iri " \
                                    f"retrieved from OLS in the field {self.name}"
                                results.append(
                                    VRC(VRConstants.WARNING,
                                        msg + section_info, record_id,
                                        self.name))
                            # the value must be the label of the ontology term
                            if not ontology.label_match_ontology(value):
                                # presumably the second argument False makes the comparison
                                # case insensitive, as suggested by the warning message below
                                if ontology.label_match_ontology(value, False):
                                    msg = f"Provided value {value} has different letter case" \
                                        f" to the term referenced by {iri}"
                                    results.append(
                                        VRC(VRConstants.WARNING,
                                            msg + section_info, record_id,
                                            self.name))
                                else:
                                    msg = f"Provided value {value} does not match to the provided ontology {iri}"
                                    results.append(
                                        VRC(VRConstants.ERROR,
                                            msg + section_info, record_id,
                                            self.name))
                elif self.type == "uri":
                    url_result = misc.is_url(value)
                    if not url_result:
                        msg = f"Invalid URI value {value} for field {self.name}"
                        results.append(
                            VRC(VRConstants.ERROR, msg + section_info,
                                record_id, self.name))
                    else:  # is in URI
                        # in image ruleset, when email provided, it must begin with mailto:
                        if misc.is_email(value):
                            if misc.is_email(
                                    value, True
                            ):  # the whole value is an email address, i.e. the mailto: prefix is missing
                                msg = f'Email address must have prefix "mailto:" in the field {self.name}'
                                results.append(
                                    VRC(VRConstants.ERROR, msg + section_info,
                                        record_id, self.name))
                        else:  # it is URL, but not email: could be a normal URL or wrong mailto: location
                            # find() > 0: mailto: is present but not at the very beginning of the value
                            if value.find("mailto:") > 0:
                                msg = f"mailto must be at position 1 to be a valid email value in the field {self.name}"
                                results.append(
                                    VRC(VRConstants.ERROR, msg + section_info,
                                        record_id, self.name))
                elif self.type == 'doi':
                    doi_result = misc.is_doi(value)
                    if not doi_result:
                        msg = f"Invalid DOI value supplied in the field {self.name}"
                        results.append(
                            VRC(VRConstants.ERROR, msg + section_info,
                                record_id, self.name))
                elif self.type == 'date':
                    # there is always a format(unit) for the date type (checked in the validation.read_in_ruleset)
                    # therefore a missing entry['units'] has already been reported by the units check
                    # above, which returns before reaching this point
                    date_format = entry['units']
                    # a non-empty result of get_matched_date is used as the error message
                    date_result = misc.get_matched_date(value, date_format)
                    if date_result:
                        results.append(
                            VRC(VRConstants.ERROR, date_result + section_info,
                                record_id, self.name))

            # stop after the type stage when anything has been reported,
            # the remaining entries are not validated
            if results:
                return results

        return results
示例#13
0
    def validate(self) -> None:
        """
        Validate the data against the ruleset
        the data needs to be scanned twice, first time to validate individual field in every record based on the ruleset
        second time to validate anything involving with more than one record e.g. relationships and context validation
        or more than one fields in the same record
        The validation result is stored in the object's validation_results field, keyed by the record alias.
        Nothing is validated (an error is logged) unless both the data and the ruleset are ready.
        Records which already have the status Error after one step are not validated any further.
        """
        # check whether ready to carry out validation
        if not self.data_ready_flag:
            logger.error(
                "The data is not ready, abort the validation proecess")
            return
        if not self.ruleset_pass_flag:
            logger.error(
                "The ruleset is not ready, abort the validation proecess")
            return
        # first scan
        # meanwhile split data according to material type for relationship checking
        data_by_material: Dict[str, Dict[str, Dict]] = {}
        for record in self.data:
            alias = record['alias']
            logger.info("Validate record %s", alias)
            self.validation_results[alias] = self.ruleset.validate(record)
            try:
                material = record['attributes']['Material'][0]['value'].lower()
            except KeyError:
                # error still exists, e.g. using material rather than Material, which needs to be caught
                # however the reporting should have already been done by validation
                continue
            data_by_material.setdefault(material, {})[alias] = record

        # in the current ruleset, derived from only from organism to specimen, so safe to only
        # check organism; the data may not contain any organism record at all
        organisms: Dict[str, Dict] = data_by_material.get('organism', {})
        # seconds to wait for BioSamples, without a timeout an unresponsive server blocks forever
        biosamples_timeout = 30

        # second scan
        for record in self.data:
            alias = record['alias']
            record_result = self.validation_results[alias]
            # if the record is with status Error, no more validation will be done
            if record_result.get_overall_status() == VRConstants.ERROR:
                continue
            record_id = record['attributes'][self.id_field][0]['value']
            # check relationship
            related: List[Dict] = []
            for relationship in record.get('sampleRelationships', []):
                if 'accession' in relationship:
                    # target is biosample accession which is checked in validation.check_usi_structure
                    target = relationship['accession']
                    url = f"https://www.ebi.ac.uk/biosamples/samples/{target}"
                    try:
                        response = requests.get(url,
                                                timeout=biosamples_timeout)
                        retrieved = response.status_code == 200
                    except requests.exceptions.RequestException:
                        # network problems are reported in the same way as an unsuccessful response
                        # rather than aborting the validation of all records
                        retrieved = False
                    if not retrieved:
                        record_result.add_validation_result_column(
                            VRC(
                                VRConstants.WARNING,
                                f"Fail to retrieve record {target} from "
                                f"BioSamples as required in the relationship",
                                record_id, 'sampleRelationships',
                                VRConstants.GENERAL))
                    # when retrieved: at the moment, no any IMAGE data in BioSamples,
                    # so nothing is parsed into memory (would need to check project = IMAGE)
                else:
                    target = relationship['alias']
                    if target in organisms:
                        related.append(dict(organisms[target]))
                    else:
                        record_result.add_validation_result_column(
                            VRC(
                                VRConstants.ERROR,
                                f"Could not locate the referenced record {target}",
                                record_id, 'sampleRelationships',
                                VRConstants.RELATIONSHIP))
            self.validation_results[alias] = record_result

            # if error found during relationship checking, skip context validation
            # because some context validation (relationship check etc) could not be carried out
            if record_result.get_overall_status() == VRConstants.ERROR:
                continue

            record_result = validation.context_validation(
                record, record_result, related)

            # a record without any error or warning gets an explicit Pass result
            if record_result.is_empty():
                record_result.add_validation_result_column(
                    VRC("Pass", "", record_result.record_id, "",
                        VRConstants.EMPTY))
            self.validation_results[alias] = record_result
示例#14
0
def check_ruleset(ruleset: Ruleset.RuleSet) -> VRR:
    """
    Check that the ruleset is internally consistent, i.e. for every rule the
    provided allowed values, units and ontology terms agree with the rule type
    :param ruleset: the ruleset to be validated
    :return: the validation result holding one column per detected problem,
    no column is added for a valid ruleset
    :raises TypeError: when the parameter is not exactly a Ruleset.RuleSet object
    """
    if type(ruleset) is not Ruleset.RuleSet:
        raise TypeError("The parameter must be of a RuleSet object")

    results: VRR = VRR(RULESET_CHECK_ID)

    def report(status, msg, field_name):
        # every finding is filed under the ruleset check id and category
        results.add_validation_result_column(
            VRC(status, msg, RULESET_CHECK_ID, field_name,
                VRConstant.RULESET_CHECK))

    for section_name in ruleset.get_all_section_names():
        section_rule: Ruleset.RuleSection = ruleset.get_section_by_name(
            section_name)
        # rules are grouped by requirement level, then keyed by rule name
        for rules_of_level in section_rule.get_rules().values():
            for rule_name, rule in rules_of_level.items():
                rule_type = rule.get_type()

                # allowed values: forbidden for ontology_id/text,
                # mandatory for limited value
                if rule.get_allowed_values():
                    if rule_type in ("ontology_id", "text"):
                        report(
                            VRConstant.ERROR,
                            f"No valid values should be provided to field "
                            f"{rule.get_name()} as being of {rule_type} type",
                            rule_name)
                elif rule_type == "limited value":
                    report(
                        VRConstant.ERROR,
                        f"There is no allowed values for field {rule.get_name()} being of {rule_type} type",
                        rule_name)

                # units: expected for number/date and only for them
                if rule.get_allowed_units():
                    if rule_type not in ("number", "date"):
                        report(
                            VRConstant.ERROR,
                            f"Valid units provided for field {rule.get_name()} "
                            f"having type as {rule_type} which does not expect units",
                            rule_name)
                elif rule_type in ("number", "date"):
                    report(
                        VRConstant.ERROR,
                        f"Field {rule.get_name()} has type as {rule_type} but no valid units provided",
                        rule_name)

                # ontology terms: mandatory for ontology_id,
                # only a warning when given to another type
                if rule.get_allowed_terms():
                    if rule_type != "ontology_id":
                        report(
                            VRConstant.WARNING,
                            f"Ontology terms are provided for field {rule.get_name()}. "
                            f"Please re-consider whether it needs to change to ontology_id type.",
                            rule_name)
                elif rule_type == "ontology_id":
                    report(
                        VRConstant.ERROR,
                        f"No valid terms provided to field {rule.get_name()} "
                        f"which is essential to be of ontology_id type",
                        rule_name)

    return results
示例#15
0
def _add_usi_error(result: VRR, msg: str) -> None:
    """
    Record one USI structure error in the validation result
    :param result: the validation result being populated
    :param msg: the error message
    """
    result.add_validation_result_column(
        VRC(VRConstant.ERROR, msg, USI_CHECK_ID, "", VRConstant.USI_CHECK))


def _check_usi_terms(result: VRR, error_prefix: str, alias,
                     terms_value) -> None:
    """
    Check the 'terms' entry of one attribute value: a non-empty array whose
    first element is a map using 'url' as key
    :param result: the validation result being populated
    :param error_prefix: the text every structure error message starts with
    :param alias: the alias of the record being checked
    :param terms_value: the value found under the 'terms' key
    """
    if not isinstance(terms_value, list):
        _add_usi_error(
            result, f"{error_prefix} ontology terms need to be stored "
            f"in an array in record {alias}")
    elif not terms_value:
        # an empty array has no first term to inspect, report it instead of
        # failing with IndexError
        _add_usi_error(
            result, f"{error_prefix} ontology terms array must not be empty "
            f"in record {alias}")
    elif not isinstance(terms_value[0], dict) or 'url' not in terms_value[0]:
        _add_usi_error(
            result, f"{error_prefix} url not used as key for ontology term "
            f"in record {alias}")


def _check_usi_attributes(result: VRR, error_prefix: str, alias,
                          attrs) -> None:
    """
    Check the 'attributes' field: a map from attribute name to an array of
    maps, each having 'value' and optionally 'units' and 'terms'
    :param result: the validation result being populated
    :param error_prefix: the text every structure error message starts with
    :param alias: the alias of the record being checked
    :param attrs: the value found under the 'attributes' key
    """
    if not isinstance(attrs, dict):
        _add_usi_error(
            result,
            f"{error_prefix} attributes must be stored as a map for record with alias "
            f"{alias}")
        return
    for attr_name, attr_values in attrs.items():
        if not isinstance(attr_values, list):
            _add_usi_error(
                result,
                f"{error_prefix} the values for attribute {attr_name} "
                f"needs to be in an array for record {alias}")
            continue
        for attr_value in attr_values:
            if not isinstance(attr_value, dict):
                _add_usi_error(
                    result,
                    f"{error_prefix} the attribute value of {attr_name} needs to be "
                    f"represented as a map in record {alias}")
                continue
            if 'value' not in attr_value:
                _add_usi_error(
                    result,
                    f"{error_prefix} could not find 'value' keyword for attribute"
                    f" {attr_name} in record {alias}")
                continue
            for key in attr_value:
                if key not in ('value', 'units', 'terms'):
                    _add_usi_error(
                        result,
                        f"{error_prefix} Unrecognized keyword {key} used in "
                        f"attribute {attr_name} in record {alias}")
                elif key == 'terms':
                    _check_usi_terms(result, error_prefix, alias,
                                     attr_value[key])


def _check_usi_relationships(result: VRR, error_prefix: str, alias,
                             relationships) -> None:
    """
    Check the optional 'sampleRelationships' field: an array of maps, each
    having exactly a target (alias or accession) and a relationshipNature
    :param result: the validation result being populated
    :param error_prefix: the text every structure error message starts with
    :param alias: the alias of the record being checked
    :param relationships: the value found under the 'sampleRelationships' key
    """
    if not isinstance(relationships, list):
        _add_usi_error(
            result,
            f"{error_prefix} sampleRelationships field must have values within an array "
            f"for record with alias {alias}")
        return
    # targets already referenced by this record, used to spot duplicates
    seen_targets = set()
    # the first nature met other than 'same as'/'recurated from'
    existing_keyword: str = ''
    for relationship in relationships:
        if not isinstance(relationship, dict):
            _add_usi_error(
                result,
                f"{error_prefix} relationship needs to be presented as a hash "
                f"for record with alias {alias}")
            continue
        # relationship must have two and only two elements
        if len(relationship) != 2:
            _add_usi_error(
                result,
                f"{error_prefix} two and only two keys (alias/accession and relationshipNature) "
                f"must be presented within every relationship. Affected record {alias}")
            continue
        if 'relationshipNature' not in relationship or not (
                'alias' in relationship or 'accession' in relationship):
            _add_usi_error(
                result,
                f"{error_prefix} Unrecognized key used (only can be alias/accession and "
                f"relationshipNature) within one relationship. Affected record {alias}")
            continue
        relationship_nature = relationship['relationshipNature']
        if relationship_nature not in ALLOWED_RELATIONSHIP_NATURE:
            _add_usi_error(
                result,
                f"{error_prefix} Unrecognized relationship nature {relationship_nature} "
                f"within record {alias}")
            continue
        # apart from 'same as' and 'recurated from', one record can only use
        # one relationship nature
        if relationship_nature not in ('same as', 'recurated from') \
                and relationship_nature != existing_keyword:
            if not existing_keyword:
                existing_keyword = relationship_nature
            else:
                _add_usi_error(
                    result,
                    f"{error_prefix} More than one relationship natures found "
                    f"within record {alias}")
        # with exactly two keys, only one of alias/accession can be present
        if 'alias' in relationship:
            target = relationship['alias']
            if misc.is_biosample_record(target):
                _add_usi_error(
                    result,
                    f"{error_prefix} In relationship alias can only "
                    f"take non-BioSamples accession, not {target}")
        else:
            target = relationship['accession']
            if not misc.is_biosample_record(target):
                _add_usi_error(
                    result,
                    f"{error_prefix} In relationship accession can only "
                    f"take BioSamples accession, not {target}")
        if target in seen_targets:
            _add_usi_error(
                result,
                f"Duplicated relationship {relationship_nature} with {target}"
                f" for record {alias}")
        seen_targets.add(target)


def check_usi_structure(sample: List[Dict]) -> VRR:
    """
    Check whether the record values are represented in the format which can be submitted via USI
    This also guarantees that the following validation does not need to worry about how the data is represented
    This function also checks whether more than one record use the same alias (USI requirement)
    Every problem found is added to the returned result as an error, a
    malformed record never stops the check of the remaining records
    :param sample: the records represented in JSON
    :return: the validation result holding one error column per problem found
    """
    logger.debug("Check whether data meets USI data format standard")
    # number of records using each alias
    count: Dict[str, int] = {}
    result: VRR = VRR(USI_CHECK_ID)
    error_prefix = 'Wrong JSON structure:'
    if not isinstance(sample, list):
        _add_usi_error(
            result,
            f"{error_prefix} all data need to be encapsulated in an array")
        return result
    # mandatory fields besides alias, paired with the wording of the message
    mandatory_fields = (('title', 'title field'),
                        ('releaseDate', 'releaseDate field'),
                        ('taxonId', 'taxonId field'),
                        ('attributes', 'attributes'))
    for one in sample:
        # check the structure, if wrong, could not continue, so directly skip to next record
        if not isinstance(one, dict):
            _add_usi_error(
                result,
                f"{error_prefix} some records are not represented as hashes")
            continue
        if 'alias' not in one:
            _add_usi_error(
                result,
                f"{error_prefix} some records do not have alias which is "
                f"mandatory and used to identify record")
            continue
        alias = one['alias']
        # list and dict are not hashable, so they cannot be counted below
        if isinstance(alias, (list, dict)):
            _add_usi_error(result,
                           f"{error_prefix} alias can only be a string")
            continue
        count[alias] = count.get(alias, 0) + 1

        # check existence of mandatory fields, reporting every missing one
        error_flag = False
        for field_name, wording in mandatory_fields:
            if field_name not in one:
                _add_usi_error(
                    result,
                    f"{error_prefix} no {wording} for record with alias as {alias}")
                error_flag = True
        # values could not be checked when any mandatory field is missing
        if error_flag:
            continue

        # check value of mandatory fields except type of alias
        # which is checked above and duplicate check outside this loop
        # taxonId must be an integer
        if not isinstance(one['taxonId'], int):
            _add_usi_error(
                result,
                f"{error_prefix} taxonId value for record {alias} is not an integer")
        # releaseDate needs to be in YYYY-MM-DD, a truthy value returned by
        # the helper is reported as the description of the error
        date_check = misc.get_matched_date(one['releaseDate'], "YYYY-MM-DD")
        if date_check:
            _add_usi_error(
                result,
                f"{error_prefix} {date_check} for record with alias value {alias}")
        _check_usi_attributes(result, error_prefix, alias, one['attributes'])

        # optional field
        if 'sampleRelationships' in one:
            _check_usi_relationships(result, error_prefix, alias,
                                     one['sampleRelationships'])

    for key, occurrences in count.items():
        if occurrences > 1:
            _add_usi_error(
                result,
                f"There are more than one record having {key} as its alias")
    return result