def check(self, dir, args):
    """Check that every biobank carries its basic descriptive fields.

    For each biobank returned by ``dir.getBiobanks()``:

    * ``juridical_person`` must be present, non-blank and not ``N/A``/``NA``
      (reported as ERROR),
    * ``head_firstname`` and ``head_lastname`` must both be present and
      non-blank (WARNING),
    * ``head_role`` must be present and non-blank (INFO),
    * ``contact`` must be present and be a dict (ERROR).

    Args:
        dir: directory object; ``getBiobanks()`` and ``getBiobankNN()`` are
            used here.
        args: run-time options; not used by this check.

    Returns:
        List of ``DataCheckWarning`` objects, one per detected problem.
    """
    warnings = []
    log.info("Running biobank fields checks (BiobankFields)")

    # Raw strings: '\s' in a plain literal is an invalid escape sequence.
    blank_pattern = r'^\s*$'
    not_available_pattern = r'^\s*N/?A\s*$'

    def is_blank(entity, key):
        # True when the attribute is absent or contains only whitespace.
        return key not in entity or re.search(blank_pattern, entity[key])

    def warn(biobank, level, message):
        # The national node is looked up only when a warning is really emitted.
        warnings.append(
            DataCheckWarning(self.__class__.__name__, "",
                             dir.getBiobankNN(biobank['id']), level,
                             biobank['id'], DataCheckEntityType.BIOBANK,
                             message))

    for biobank in dir.getBiobanks():
        if (is_blank(biobank, 'juridical_person')
                or re.search(not_available_pattern,
                             biobank['juridical_person'])):
            warn(biobank, DataCheckWarningLevel.ERROR,
                 "Missing juridical person ('juridical_person' attribute is empty)")

        if (is_blank(biobank, 'head_firstname')
                or is_blank(biobank, 'head_lastname')):
            warn(biobank, DataCheckWarningLevel.WARNING,
                 "Missing head person name ('head_firstname' and/or 'head_lastname' attributes are empty)")

        if is_blank(biobank, 'head_role'):
            warn(biobank, DataCheckWarningLevel.INFO,
                 "Missing head person role ('head_role' attribute is empty)")

        # isinstance() also accepts dict subclasses, unlike the exact type test.
        if 'contact' not in biobank or not isinstance(biobank['contact'], dict):
            warn(biobank, DataCheckWarningLevel.ERROR,
                 "Missing valid contact for the biobank")

    return warnings
def check(self, dir, args):
    """Report every biobank that has no collection attached.

    A biobank is flagged with an ERROR when the graph returned by
    ``dir.getGraphBiobankCollectionsFromBiobank()`` for its id has no edges.

    Args:
        dir: directory object queried for biobanks and their collection graph.
        args: run-time options; not used by this check.

    Returns:
        List of ``DataCheckWarning`` objects, one per biobank without edges.
    """
    log.info("Running collection existence checks (CollectionExistence)")
    found = []
    for bb in dir.getBiobanks():
        bb_id = bb['id']
        graph = dir.getGraphBiobankCollectionsFromBiobank(bb_id)
        if len(graph.edges) >= 1:
            # at least one edge in the biobank's collection graph - nothing to report
            continue
        found.append(
            DataCheckWarning(self.__class__.__name__, "",
                             dir.getBiobankNN(bb_id),
                             DataCheckWarningLevel.ERROR, bb_id,
                             DataCheckEntityType.BIOBANK,
                             "Missing at least one collection for biobank"))
    return found
def check(self, dir, args):
    """Report collections that are not linked to anything in the graph.

    A collection is flagged with an ERROR when the graph returned by
    ``dir.getGraphBiobankCollectionsFromCollection()`` for its id has no
    edges.

    Args:
        dir: directory object queried for collections and their graph.
        args: run-time options; not used by this check.

    Returns:
        List of ``DataCheckWarning`` objects, one per orphaned collection.
    """
    warnings = []
    log.info("Running orphaned collection checks (OrphanedCollections)")
    for collection in dir.getCollections():
        collections = dir.getGraphBiobankCollectionsFromCollection(
            collection['id'])
        if len(collections.edges) < 1:
            # The national node must be derived from the collection itself:
            # no biobank is in scope here (the previous code referenced an
            # undefined `biobank` name and raised NameError at this point).
            warnings.append(
                DataCheckWarning(self.__class__.__name__, "",
                                 dir.getCollectionNN(collection['id']),
                                 DataCheckWarningLevel.ERROR,
                                 collection['id'],
                                 DataCheckEntityType.COLLECTION,
                                 "Orphaned collection"))
    return warnings
def check(self, dir, args):
    """Check names and descriptions of biobanks and collections.

    For every biobank and every collection:

    * a missing, blank or ``N/A``/``NA`` ``description`` yields a WARNING,
    * a ``description`` for which ``descriptionTooShort()`` is true yields
      a WARNING,
    * a missing, blank or ``N/A``/``NA`` ``name`` yields an ERROR.

    Args:
        dir: directory object queried for biobanks, collections and their
            national nodes.
        args: run-time options; not used by this check.

    Returns:
        List of ``DataCheckWarning`` objects, one per detected problem.
    """
    warnings = []
    log.info("Running empty or semi-empty fields checks (SemiemptyFields)")

    # Raw strings: '\s' in a plain literal is an invalid escape sequence.
    blank_pattern = r'^\s*$'
    not_available_pattern = r'^\s*N/?A\s*$'

    def is_missing(entity, key):
        # Absent, whitespace-only or an "N/A" placeholder.
        return (key not in entity
                or re.search(blank_pattern, entity[key])
                or re.search(not_available_pattern, entity[key]))

    def check_entity(entity, label, entity_type, get_nn):
        # `label` is "biobank" or "collection" and is used in the messages;
        # `get_nn` is called lazily, only when a warning is emitted.
        def warn(level, message):
            warnings.append(
                DataCheckWarning(self.__class__.__name__, "",
                                 get_nn(entity['id']), level, entity['id'],
                                 entity_type, message))

        if is_missing(entity, 'description'):
            warn(DataCheckWarningLevel.WARNING,
                 f"Missing description for {label} ('description' attribute is empty for the {label})")

        if 'description' in entity and descriptionTooShort(entity['description']):
            warn(DataCheckWarningLevel.WARNING,
                 f"Suspiciously short description for {label} ('description' attribute {entity['description']} has less than {str(minDescWords)} words)")

        if is_missing(entity, 'name'):
            # The collection variant of this message used to say
            # "for the biobank"; it now names the right entity kind.
            warn(DataCheckWarningLevel.ERROR,
                 f"Missing name for {label} ('name' attribute is empty for the {label})")

    for biobank in dir.getBiobanks():
        check_entity(biobank, "biobank", DataCheckEntityType.BIOBANK,
                     dir.getBiobankNN)

    for collection in dir.getCollections():
        check_entity(collection, "collection", DataCheckEntityType.COLLECTION,
                     dir.getCollectionNN)

    return warnings
def check(self, dir, args):
    """Check URLs of biobanks and access URIs of collections.

    Biobanks without a ``url`` get a WARNING; existing biobank URLs and any
    non-blank ``data_access_uri`` / ``sample_access_uri`` /
    ``image_access_uri`` of collections are passed to ``testURL()`` and the
    warnings it returns are collected.

    The whole check is skipped (empty result) when ``'URLs'`` is listed in
    ``args.disableChecksRemote``. The cache in ``data-check-cache/URLs`` is
    cleared first when ``'URLs'`` is listed in ``args.purgeCaches``.

    Args:
        dir: directory object queried for biobanks, collections and their
            national nodes.
        args: run-time options; ``disableChecksRemote`` and ``purgeCaches``
            are read.

    Returns:
        List of ``DataCheckWarning`` objects.
    """
    # NOTE(review): the cache is published as a module-level global;
    # presumably testURL() reads it - confirm before making it local.
    global cache

    warnings = []
    log.info("Running URL checks (CheckURLs)")
    assert 'URLs' in __main__.remoteCheckList
    if 'URLs' in args.disableChecksRemote:
        return warnings

    cache_dir = 'data-check-cache/URLs'
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    cache = Cache(cache_dir)

    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    blank_pattern = r'^\s*$'
    # (collection attribute, description handed over to testURL)
    collection_uri_checks = (
        ('data_access_uri', "Data access URL for collection"),
        ('sample_access_uri', "Sample access URL for collection"),
        ('image_access_uri', "Image access URL for collection"),
    )

    try:
        if 'URLs' in args.purgeCaches:
            cache.clear()

        log.info("Testing biobank URLs")
        for biobank in dir.getBiobanks():
            if 'url' not in biobank or re.search(blank_pattern, biobank['url']):
                warnings.append(
                    DataCheckWarning(self.__class__.__name__, "",
                                     dir.getBiobankNN(biobank['id']),
                                     DataCheckWarningLevel.WARNING,
                                     biobank['id'],
                                     DataCheckEntityType.BIOBANK,
                                     "Missing URL"))
            else:
                warnings += testURL(
                    biobank['url'],
                    DataCheckWarning(self.__class__.__name__, "",
                                     dir.getBiobankNN(biobank['id']),
                                     DataCheckWarningLevel.ERROR,
                                     biobank['id'],
                                     DataCheckEntityType.BIOBANK,
                                     "Biobank URL"))

        log.info("Testing collection URLs")
        # This loop must walk the collections; it used to iterate over
        # dir.getBiobanks(), so collection access URIs were never tested.
        for collection in dir.getCollections():
            # non-existence of access URIs is tested in the access policy
            # checks - here we only check validity of the URL if it exists
            for attribute, description in collection_uri_checks:
                if attribute in collection and not re.search(
                        blank_pattern, collection[attribute]):
                    warnings += testURL(
                        collection[attribute],
                        DataCheckWarning(self.__class__.__name__, "",
                                         dir.getCollectionNN(collection['id']),
                                         DataCheckWarningLevel.ERROR,
                                         collection['id'],
                                         DataCheckEntityType.COLLECTION,
                                         description))
    finally:
        # close the cache even when one of the checks raises
        cache.close()

    return warnings
def check(self, dir, args):
    """Check geographical coordinates of biobanks and collections.

    For every biobank and collection that has non-blank ``latitude`` and
    ``longitude``:

    * each coordinate must match ``-?digits.digits``; otherwise an ERROR is
      reported,
    * commas are then replaced by periods in the stored values (convenience,
      so the remaining checks can still run),
    * when geocoding is enabled, the coordinates are reverse-geocoded
      (results are cached in ``data-check-cache/geolocator``) and a WARNING
      is reported if the resulting country differs from the expected one.
      Expected countries ``IARC`` and ``EU`` are never compared, and a
      geocoded ``GB`` is accepted for ``UK``. For collections the expected
      country is the one of the owning biobank.

    Biobanks (not collections) without coordinates get an INFO warning.

    Geocoding is disabled when ``'geocoding'`` is listed in
    ``args.disableChecksRemote``; the cache is cleared first when it is
    listed in ``args.purgeCaches``.

    Args:
        dir: directory object queried for biobanks, collections, their
            national nodes and the collection-to-biobank relation.
        args: run-time options; ``disableChecksRemote`` and ``purgeCaches``
            are read.

    Returns:
        List of ``DataCheckWarning`` objects.
    """
    warnings = []
    log.info("Running geographical location checks (BiobankGeo)")
    # This is to be enabled for real runs.
    assert 'geocoding' in __main__.remoteCheckList
    geoCodingEnabled = 'geocoding' not in args.disableChecksRemote

    # Raw strings: '\s', '\d', '\.' in plain literals are invalid escapes.
    blank_pattern = r'^\s*$'
    geocoords_pattern = r'^-?\d+\.\d+$'

    def has_value(entity, key):
        # Attribute is present and not whitespace-only.
        return key in entity and not re.search(blank_pattern, entity[key])

    def biobank_country(biobank):
        return biobank['country']['id']

    def owning_biobank_country(collection):
        # Collections are compared against the country of their biobank.
        biobankId = dir.getCollectionBiobankId(collection['id'])
        return dir.getBiobankById(biobankId)['country']['id']

    def check_coordinates(entity, label, entity_type, get_nn, get_country):
        # Validate and (optionally) reverse-geocode one biobank/collection.
        # `label` is "biobank" or "collection" and is used in the messages.
        def warn(level, message):
            warnings.append(
                DataCheckWarning(self.__class__.__name__, "",
                                 get_nn(entity['id']), level, entity['id'],
                                 entity_type, message))

        # we check before doing any convenience substitutions
        for axis in ('latitude', 'longitude'):
            if not re.search(geocoords_pattern, entity[axis]):
                warn(DataCheckWarningLevel.ERROR,
                     "Invalid " + label + " " + axis
                     + " (should be a decimal number with period without any spaces or stray characters around - the surrounding quotes are added in this report): offending value '"
                     + entity[axis] + "'")

        # this is for convenience - if there are commas used instead of
        # periods, we should still do the remaining checks
        entity['latitude'] = entity['latitude'].replace(',', '.')
        entity['longitude'] = entity['longitude'].replace(',', '.')
        latitude = entity['latitude']
        longitude = entity['longitude']

        if not (re.search(geocoords_pattern, latitude)
                and re.search(geocoords_pattern, longitude)):
            return
        if not geoCodingEnabled:
            return

        loc_string = latitude + ", " + longitude
        logMessage = "Checking reverse geocoding for " + loc_string
        try:
            if loc_string in cache and cache[loc_string] != "":
                country_code = cache[loc_string]
            else:
                location = geolocator.reverse(loc_string, language='en')
                country_code = location.raw['address']['country_code']
                cache[loc_string] = country_code
            logMessage += " -> OK"
            expected = get_country(entity)
            detected = country_code.upper()
            if (expected not in ("IARC", "EU")
                    and detected != expected
                    and not (detected == "GB" and expected == "UK")):
                # Report the same country that was used in the comparison.
                warn(DataCheckWarningLevel.WARNING,
                     "Geolocation of the " + label
                     + " is likely outside of its country " + expected
                     + "; " + label + " seems to be in " + detected
                     + f" based on geographical coordinates 'latitude'={latitude} 'longitude'={longitude}")
        except Exception as e:
            # Deliberately broad: any geocoding/lookup failure becomes a
            # warning instead of aborting the whole run.
            logMessage += " -> failed (" + str(e) + ")"
            warn(DataCheckWarningLevel.WARNING,
                 "Reverse geocoding of the " + label + " location failed ("
                 + str(e) + ")")
        log.info(logMessage)

    cache_dir = 'data-check-cache/geolocator'
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    cache = Cache(cache_dir)
    try:
        if 'geocoding' in args.purgeCaches:
            cache.clear()

        geolocator = Nominatim(
            user_agent='Mozilla/5.0 (X11; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0',
            timeout=15)

        for biobank in dir.getBiobanks():
            if has_value(biobank, 'latitude') and has_value(biobank, 'longitude'):
                check_coordinates(biobank, "biobank",
                                  DataCheckEntityType.BIOBANK,
                                  dir.getBiobankNN, biobank_country)
            else:
                warnings.append(
                    DataCheckWarning(
                        self.__class__.__name__, "",
                        dir.getBiobankNN(biobank['id']),
                        DataCheckWarningLevel.INFO, biobank['id'],
                        DataCheckEntityType.BIOBANK,
                        "Missing geographical coordinates ('latitude and/or 'longitude' attributes are empty)"
                    ))

        for collection in dir.getCollections():
            if (has_value(collection, 'latitude')
                    and has_value(collection, 'longitude')):
                check_coordinates(collection, "collection",
                                  DataCheckEntityType.COLLECTION,
                                  dir.getCollectionNN, owning_biobank_country)
    finally:
        # close the cache even when one of the checks raises
        cache.close()

    return warnings
def check(self, dir, args):
    """Check internal consistency of the content description of collections.

    For every collection the following groups of checks are run:

    * diagnoses: ranges in diagnosis ids, validity of ORPHA codes (only when
      an ORPHA mapper is set),
    * collection type present; ``size`` versus ``order_of_magnitude``;
      large collections without sub-collections or exact size,
    * diagnoses versus collection types (HOSPITAL, DISEASE_SPECIFIC, RD)
      and versus the MEDICAL_RECORDS data category,
    * materials versus the BIOLOGICAL_SAMPLES data category,
    * ORPHA / ICD-10 cross-mapping hints for rare disease collections,
    * imaging modalities / dataset types versus the IMAGING_DATA category,
    * ``age_unit``, ``age_low`` and ``age_high`` consistency.

    Args:
        dir: directory object queried for collections, national nodes,
            sub-collections and the ORPHA code mapper.
        args: run-time options; not used by this check.

    Returns:
        List of ``DataCheckWarning`` objects, one per detected problem.
    """
    warnings = []
    log.info("Running collection content checks (CollectionContent)")
    orphacodes = dir.getOrphaCodesMapper()
    diagnosis_types = ['HOSPITAL', 'DISEASE_SPECIFIC', 'RD']

    def warn(collection, level, message):
        # The national node is looked up only when a warning is really emitted.
        warnings.append(
            DataCheckWarning(self.__class__.__name__, "",
                             dir.getCollectionNN(collection['id']), level,
                             collection['id'],
                             DataCheckEntityType.COLLECTION, message))

    for collection in dir.getCollections():
        # NOTE(review): used as an exponent and with %d below, so the id is
        # assumed to be an integer - confirm against the data model.
        OoM = collection['order_of_magnitude']['id']
        materials = Directory.getListOfEntityAttributeIds(
            collection, 'materials')
        data_categories = Directory.getListOfEntityAttributeIds(
            collection, 'data_categories')
        types = Directory.getListOfEntityAttributeIds(collection, 'type')

        # ---- diagnoses ---------------------------------------------------
        diags = []
        diags_icd10 = []  # ICD-10 codes with the 'urn:miriam:icd:' prefix removed
        diags_orpha = []  # ORPHA codes with the 'ORPHA:' prefix removed
        if 'diagnosis_available' in collection:
            diag_ranges = []
            for d in collection['diagnosis_available']:
                diags.append(d['id'])
                if re.search('-', d['id']):
                    diag_ranges.append(d['id'])
                if re.search('^urn:miriam:icd:', d['id']):
                    diags_icd10.append(
                        re.sub('^urn:miriam:icd:', '', d['id']))
                elif re.search('^ORPHA:', d['id']):
                    if dir.issetOrphaCodesMapper():
                        # NOTE(review): the whole diagnosis entry (not its
                        # id) is passed on, as before - confirm this is what
                        # isValidOrphaCode() expects.
                        if orphacodes.isValidOrphaCode(d):
                            diags_orpha.append(
                                re.sub('^ORPHA:', '', d['id']))
                        else:
                            warn(collection, DataCheckWarningLevel.ERROR,
                                 "Invalid ORPHA code found: %s" % (d['id']))
            if diag_ranges:
                warn(collection, DataCheckWarningLevel.ERROR,
                     "It seems that diagnoses contains range - this will render the diagnosis search ineffective for the given collection. Violating diagnosis term(s): "
                     + '; '.join(diag_ranges))

        # ---- type and size -----------------------------------------------
        if len(types) < 1:
            warn(collection, DataCheckWarningLevel.ERROR,
                 "Collection type not provided")

        if 'size' in collection and isinstance(collection['size'], int):
            # Parentheses spell out how Python already grouped the original
            # condition: the lower bound is enforced only for OoM > 1, the
            # upper bound always.
            # NOTE(review): confirm this grouping is the intended one.
            if ((OoM > 1 and collection['size'] < 10**OoM)
                    or collection['size'] > 10**(OoM + 1)):
                warn(collection, DataCheckWarningLevel.ERROR,
                     "Size of the collection does not match its order of magnitude: size = "
                     + str(collection['size'])
                     + ", order of magnitude is %d (size between %d and %d)"
                     % (OoM, 10**OoM, 10**(OoM + 1)))

        if OoM > 4:
            subCollections = dir.getCollectionsDescendants(collection['id'])
            if len(subCollections) < 1:
                warn(collection, DataCheckWarningLevel.INFO,
                     "Suspicious situation: large collection (> 100,000 samples or cases) without subcollections; unless it is a really homogeneous collection, it is advisable to refine such a collection into sub-collections to give users better insight into what is stored there")

        if OoM > 5:
            if 'size' not in collection or collection['size'] == 0:
                warn(collection, DataCheckWarningLevel.INFO,
                     "Suspicious situation: large collection (> 1,000,000 samples or cases) without exact size specified")

        # ---- diagnoses vs. types / data categories -------------------------
        has_diagnosis_type = any(x in types for x in diagnosis_types)

        if has_diagnosis_type and len(diags) < 1:
            warn(collection, DataCheckWarningLevel.ERROR,
                 "No diagnoses provide for HOSPITAL or DISEASE_SPECIFIC or RD collection")

        if len(diags) > 0 and not has_diagnosis_type:
            warn(collection, DataCheckWarningLevel.INFO,
                 "Diagnoses provided but none of HOSPITAL, DISEASE_SPECIFIC, RD is specified as collection type (this may be easily false positive check)")

        if 'BIOLOGICAL_SAMPLES' in data_categories and len(materials) == 0:
            warn(collection, DataCheckWarningLevel.ERROR,
                 "No material types are provided while biological samples are collected")

        if len(materials) > 0 and 'BIOLOGICAL_SAMPLES' not in data_categories:
            warn(collection, DataCheckWarningLevel.ERROR,
                 "Sample types advertised but BIOLOGICAL_SAMPLES missing among its data categories")

        if 'MEDICAL_RECORDS' in data_categories and len(diags) < 1:
            warn(collection, DataCheckWarningLevel.WARNING,
                 "No diagnoses provide for a collection with MEDICAL_RECORDS among its data categories")

        if len(diags) > 0 and 'MEDICAL_RECORDS' not in data_categories:
            warn(collection, DataCheckWarningLevel.WARNING,
                 "Diagnoses provided but no MEDICAL_RECORDS among its data categories")

        # ---- rare diseases: ORPHA <-> ICD-10 -------------------------------
        if 'RD' in types and len(diags_orpha) == 0:
            warn(collection, DataCheckWarningLevel.WARNING,
                 "Rare disease (RD) collection without ORPHA code diagnoses")
            if dir.issetOrphaCodesMapper():
                for d in diags_icd10:
                    orpha = orphacodes.icd10ToOrpha(d)
                    if orpha is not None and len(orpha) > 0:
                        orphalist = [
                            "%(code)s(%(name)s)/%(mapping_type)s" % {
                                'code': c['code'],
                                'name': orphacodes.orphaToNamesString(c['code']),
                                'mapping_type': c['mapping_type']
                            } for c in orpha
                        ]
                        warn(collection, DataCheckWarningLevel.INFO,
                             "Consider adding following ORPHA code(s) to the RD collection - based on mapping ICD-10 code %s to ORPHA codes: %s"
                             % (d, ",".join(orphalist)))

        if len(diags_orpha) > 0 and 'RD' not in types:
            warn(collection, DataCheckWarningLevel.WARNING,
                 "ORPHA code diagnoses provided, but collection not marked as rare disease (RD) collection")

        if len(diags_orpha) > 0 and len(diags_icd10) == 0:
            warn(collection, DataCheckWarningLevel.WARNING,
                 "ORPHA code diagnoses specified, but no ICD-10 equivalents provided, thus making collection impossible to find for users using ICD-10 codes")

        if len(diags_orpha) > 0 and dir.issetOrphaCodesMapper():
            for d in diags_orpha:
                # tolerate a mapper that has no translation (None) for a code
                icd10codes = orphacodes.orphaToIcd10(d) or []
                for c in icd10codes:
                    # diags_icd10 holds codes WITHOUT the 'urn:miriam:icd:'
                    # prefix, so the bare code has to be compared; the
                    # prefixed form could never match and the hint was
                    # emitted for every mapping.
                    if c['code'] not in diags_icd10:
                        warn(collection, DataCheckWarningLevel.INFO,
                             "ORPHA code %s provided, but its translation to ICD-10 as %s is not provided (mapping is of %s type). It is recommended to provide this translation explicitly until Directory implements full semantic mapping search."
                             % (d, c['code'], c['mapping_type']))

        # ---- imaging -------------------------------------------------------
        modalities = []
        if 'imaging_modality' in collection:
            modalities = [m['id'] for m in collection['imaging_modality']]

        image_dataset_types = []
        if 'image_dataset_type' in collection:
            image_dataset_types = [
                idt['id'] for idt in collection['image_dataset_type']
            ]

        if 'IMAGING_DATA' in data_categories:
            if len(modalities) < 1:
                warn(collection, DataCheckWarningLevel.ERROR,
                     "No image modalities provided for image collection")
            if len(image_dataset_types) < 1:
                warn(collection, DataCheckWarningLevel.WARNING,
                     "No image dataset types provided for image collection")

        if ((len(modalities) > 0 or len(image_dataset_types) > 0)
                and 'IMAGING_DATA' not in data_categories):
            warn(collection, DataCheckWarningLevel.ERROR,
                 "Imaging modalities or image data set found, but IMAGING_DATA is not among data categories: image_modality = %s, image_dataset_type = %s"
                 % (modalities, image_dataset_types))

        # ---- age range -----------------------------------------------------
        age_unit = None
        # Reset for every collection so that a value from the previous
        # iteration (or an unbound name) is never consulted below.
        age_units = []
        if 'age_unit' in collection:
            age_units = collection['age_unit']
            if len(age_units) > 1:
                warn(collection, DataCheckWarningLevel.ERROR,
                     "Ambiguous speification of age_unit - only one value is permitted. Provided values %s"
                     % (age_units,))
            elif len(age_units) == 1:
                # NOTE(review): compared to plain strings below; confirm the
                # entries are unit names and not {'id': ...} records.
                age_unit = age_units[0]

        # An age range needs a unit. The previous condition tested
        # 'age_low' instead of the unit and could read an undefined
        # `age_units`.
        if (('age_high' in collection or 'age_low' in collection)
                and len(age_units) < 1):
            warn(collection, DataCheckWarningLevel.ERROR,
                 f"Missing age_unit for provided age range: {collection.get('age_low')}-{collection.get('age_high')}")

        # lowest accepted age: -1 year, converted to the unit in use
        age_min_limit = -1
        if age_unit == "MONTH":
            age_min_limit = age_min_limit * 12
        elif age_unit == "WEEK":
            age_min_limit = age_min_limit * 52.1775
        elif age_unit == "DAY":
            age_min_limit = age_min_limit * 365.2

        if 'age_high' in collection and collection['age_high'] < age_min_limit:
            warn(collection, DataCheckWarningLevel.ERROR,
                 "Age_high is below the minimum value limit (%d %s): offending value %d"
                 % (age_min_limit, age_unit, collection['age_high']))

        if 'age_low' in collection and collection['age_low'] < age_min_limit:
            warn(collection, DataCheckWarningLevel.ERROR,
                 "Age_low is below the minimum value limit (%d %s): offending value %d"
                 % (age_min_limit, age_unit, collection['age_low']))

        if 'age_high' in collection and 'age_low' in collection:
            if collection['age_low'] > collection['age_high']:
                warn(collection, DataCheckWarningLevel.ERROR,
                     "Age_low (%d) is higher than age_high (%d)"
                     % (collection['age_low'], collection['age_high']))
            elif collection['age_low'] == collection['age_high']:
                warn(collection, DataCheckWarningLevel.INFO,
                     "Suspect situation: age_low == age_high == (%d) (may be false positive)"
                     % (collection['age_low']))

    return warnings
def check(self, dir, args):
    """Check names, e-mail addresses and phone numbers of contacts.

    For every contact returned by ``dir.getContacts()``:

    * missing or blank ``first_name`` / ``last_name`` yield a WARNING,
    * missing or blank ``email`` yields an ERROR; an address rejected by
      ``validate_email()`` yields a WARNING,
    * unless ``'emails'`` is listed in ``args.disableChecksRemote``,
      a syntactically valid address is additionally checked with
      ``validate_email(..., check_mx=True)``; the outcome is cached per
      address in ``data-check-cache/emails`` and a failure yields a WARNING,
    * missing or blank ``phone`` yields a WARNING; a phone number not
      matching the international ``+digits`` pattern yields an ERROR.

    The cache is cleared first when ``'emails'`` is listed in
    ``args.purgeCaches``.

    Args:
        dir: directory object queried for contacts and their national nodes.
        args: run-time options; ``disableChecksRemote`` and ``purgeCaches``
            are read.

    Returns:
        List of ``DataCheckWarning`` objects, one per detected problem.
    """
    warnings = []
    log.info("Running contact fields checks (ContactFields)")
    assert 'emails' in __main__.remoteCheckList
    ValidateEmails = 'emails' not in args.disableChecksRemote

    # Raw strings: '\s' / '\+' in plain literals are invalid escape sequences.
    blank_pattern = r'^\s*$'
    phone_pattern = r'^\+(?:[0-9]??){6,14}[0-9]$'

    def is_blank(contact, key):
        # True when the attribute is absent or contains only whitespace.
        return key not in contact or re.search(blank_pattern, contact[key])

    def make_warning(contact, level, message):
        return DataCheckWarning(self.__class__.__name__, "",
                                dir.getContactNN(contact['id']), level,
                                contact['id'], DataCheckEntityType.CONTACT,
                                message)

    def unreachable_email_warning(contact):
        return make_warning(
            contact, DataCheckWarningLevel.WARNING,
            "Email for contact seems to be unreachable because of missing DNS MX record")

    cache_dir = 'data-check-cache/emails'
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    cache = Cache(cache_dir)
    try:
        if 'emails' in args.purgeCaches:
            cache.clear()

        for contact in dir.getContacts():
            if is_blank(contact, 'first_name'):
                warnings.append(make_warning(
                    contact, DataCheckWarningLevel.WARNING,
                    "Missing first name for contact ('first_name' attribute is empty)"))

            if is_blank(contact, 'last_name'):
                warnings.append(make_warning(
                    contact, DataCheckWarningLevel.WARNING,
                    "Missing last name for contact ('last_name' attribute is empty)"))

            if is_blank(contact, 'email'):
                warnings.append(make_warning(
                    contact, DataCheckWarningLevel.ERROR,
                    "Missing email for contact ('email' attribute is empty)"))
            elif not validate_email(contact['email']):
                warnings.append(make_warning(
                    contact, DataCheckWarningLevel.WARNING,
                    "Email for contact is invalid - offending 'email' attribute value: "
                    + contact['email']))
            elif ValidateEmails:
                # This is pretty dramatic test and should be used sparingly
                contact_email = contact['email']
                log_message = "Validating email " + contact_email
                try:
                    if contact_email in cache:
                        if cache[contact_email]['valid']:
                            log_message += " -> OK"
                        else:
                            log_message += " -> failed"
                            # The cache is keyed by address only, so the
                            # stored warning may belong to another contact
                            # sharing this e-mail; report against the
                            # contact being checked now.
                            warnings.append(unreachable_email_warning(contact))
                    elif not validate_email(contact_email, check_mx=True):
                        log_message += " -> failed"
                        warning = unreachable_email_warning(contact)
                        warnings.append(warning)
                        # entry layout kept for compatibility with existing caches
                        cache[contact_email] = {
                            'valid': False,
                            'warning': warning
                        }
                    else:
                        log_message += " -> OK"
                        cache[contact_email] = {
                            'valid': True,
                            'warning': None
                        }
                    log.info(log_message)
                except (DNS.Base.TimeoutError, DNS.Base.ServerError,
                        DNS.Base.SocketError) as e:
                    # DNS trouble is logged only; no result is cached, so
                    # the address is retried on the next run.
                    log_message += " -> failed with exception (" + str(e) + ")"
                    log.error(log_message)

            if is_blank(contact, 'phone'):
                warnings.append(make_warning(
                    contact, DataCheckWarningLevel.WARNING,
                    "Missing phone for contact ('phone' attribute is empty'"))
            elif not re.search(phone_pattern, contact['phone']):
                warnings.append(make_warning(
                    contact, DataCheckWarningLevel.ERROR,
                    "Phone number for contact does not conform to the E.123 international standard (means starts with + sign, no spaces) - offending phone number in 'phone' attribute: "
                    + contact['phone']))
    finally:
        # The cache was never closed here, unlike in the sibling checks.
        cache.close()

    return warnings
def check(self, dir, args):
    """Validate identifiers of biobanks, collections, contacts and networks.

    For every entity the ID is checked for:

    * the expected prefix - ``bbmri-eric:ID:`` for biobanks and
      collections, ``bbmri-eric:contactID:`` for contacts and
      ``bbmri-eric:networkID:`` for networks - followed by either ``EXT_``
      or the entity's national node code and ``_``,
    * characters outside ``A-Za-z0-9:_-``,
    * an empty hierarchy component (``::``).

    Collections must additionally start with ``<biobank id>:collection:``.
    Entities whose national node code is not a key of
    ``NNContacts.NNtoEmails`` are required to use the ``EXT_`` form.

    Prefixes are compared literally (``str.startswith``) so that IDs or
    node codes containing regular-expression metacharacters can neither
    raise ``re.error`` nor match by accident.

    Args:
        dir: directory accessor providing ``getBiobanks()``,
            ``getCollections()``, ``getContacts()``, ``getNetworks()`` and
            the matching ``get*NN(entity_id)`` lookups.
        args: parsed options; not used by this check.

    Returns:
        list of ``DataCheckWarning`` objects, in the order they were found.
    """
    warnings = []
    log.info("Running identifier validation checks (ValidateIDs)")

    ERROR = DataCheckWarningLevel.ERROR
    WARNING = DataCheckWarningLevel.WARNING

    def report(NN, level, entity_id, entity_type, message):
        """Append one warning about *entity_id* to the result list."""
        warnings.append(
            DataCheckWarning(self.__class__.__name__, "", NN, level,
                             entity_id, entity_type, message))

    def check_charset(NN, entity_id, entity_type, label):
        """Report characters that are not allowed in identifiers."""
        if re.search('[^A-Za-z0-9:_-]', entity_id):
            report(NN, ERROR, entity_id, entity_type,
                   label + " contains illegal characters "
                   + ' (shall be "A-Za-z0-9:_-")')

    def check_empty_component(NN, entity_id, entity_type, label):
        """Report an empty component ('::') in the ID hierarchy."""
        if '::' in entity_id:
            report(NN, ERROR, entity_id, entity_type,
                   label
                   + " contains :: indicating empty component in ID hierarchy")

    # ---- biobanks -------------------------------------------------------
    for biobank in dir.getBiobanks():
        biobank_id = biobank['id']
        kind = DataCheckEntityType.BIOBANK
        NN = dir.getBiobankNN(biobank_id)
        if (NN not in NNContacts.NNtoEmails
                and not biobank_id.startswith('bbmri-eric:ID:EXT_')):
            report(NN, ERROR, biobank_id, kind,
                   "BiobankID is not compliant with the specification "
                   ' (shall start with "bbmri-eric:ID:EXT_" prefix for external biobanks that have no national node)')
        if biobank_id.startswith('bbmri-eric:ID:EXT'):
            if not biobank_id.startswith('bbmri-eric:ID:EXT_'):
                report(NN, ERROR, biobank_id, kind,
                       "BiobankID is not compliant with the specification "
                       ' (shall start with "bbmri-eric:ID:EXT_" prefix for external biobanks)')
        elif not biobank_id.startswith('bbmri-eric:ID:' + NN + '_'):
            report(NN, ERROR, biobank_id, kind,
                   "BiobankID is not compliant with the specification "
                   + ' (shall start with "bbmri-eric:ID:' + NN + '_'
                   + '" prefix)')
        check_charset(NN, biobank_id, kind, "BiobankID")
        check_empty_component(NN, biobank_id, kind, "BiobankID")

    # ---- collections ----------------------------------------------------
    for collection in dir.getCollections():
        collection_id = collection['id']
        kind = DataCheckEntityType.COLLECTION
        NN = dir.getCollectionNN(collection_id)
        if (NN not in NNContacts.NNtoEmails
                and not collection_id.startswith('bbmri-eric:ID:EXT_')):
            report(NN, ERROR, collection_id, kind,
                   "CollectionID is not compliant with the specification "
                   ' (shall start with "bbmri-eric:ID:EXT_" prefix for collections from external biobanks that have no national node)')
        if collection_id.startswith('bbmri-eric:ID:EXT'):
            if not collection_id.startswith('bbmri-eric:ID:EXT_'):
                report(NN, ERROR, collection_id, kind,
                       "CollectionID is not compliant with the specification "
                       ' (shall start with "bbmri-eric:ID:EXT_" prefix for collections from external biobanks)')
        elif not collection_id.startswith('bbmri-eric:ID:' + NN + '_'):
            report(NN, ERROR, collection_id, kind,
                   "CollectionID is not compliant with the specification "
                   + ' (shall start with "bbmri-eric:ID:' + NN + '_'
                   + '" prefix)')
        check_charset(NN, collection_id, kind, "CollectionID")
        # A collection ID is expected to extend the ID of its biobank.
        biobankID = collection['biobank']['id']
        if not collection_id.startswith(biobankID + ':collection:'):
            report(NN, WARNING, collection_id, kind,
                   "CollectionID does not contain expected biobank prefix "
                   + ' (should start with ' + biobankID + ':collection:'
                   + ')')
        check_empty_component(NN, collection_id, kind, "CollectionID")

    # ---- contacts -------------------------------------------------------
    for contact in dir.getContacts():
        contact_id = contact['id']
        kind = DataCheckEntityType.CONTACT
        NN = dir.getContactNN(contact_id)
        # Contacts live in the "contactID" namespace; demanding the
        # "bbmri-eric:ID:EXT_" prefix here contradicted the check below.
        if (NN not in NNContacts.NNtoEmails
                and not contact_id.startswith('bbmri-eric:contactID:EXT_')):
            report(NN, ERROR, contact_id, kind,
                   "ContactID is not compliant with the specification "
                   ' (shall start with "bbmri-eric:contactID:EXT_" prefix for contacts for external biobanks that have no national node)')
        if contact_id.startswith('bbmri-eric:contactID:EXT'):
            if not contact_id.startswith('bbmri-eric:contactID:EXT_'):
                report(NN, ERROR, contact_id, kind,
                       "ContactID is not compliant with the specification "
                       ' (shall start with "bbmri-eric:contactID:EXT_" prefix for contacts for external biobanks)')
        elif not contact_id.startswith('bbmri-eric:contactID:' + NN + '_'):
            report(NN, ERROR, contact_id, kind,
                   "ContactID is not compliant with the specification "
                   + ' (shall start with "bbmri-eric:contactID:' + NN + '_'
                   + '" prefix)')
        check_charset(NN, contact_id, kind, "ContactID")
        check_empty_component(NN, contact_id, kind, "ContactID")

    # ---- networks -------------------------------------------------------
    for network in dir.getNetworks():
        network_id = network['id']
        kind = DataCheckEntityType.NETWORK
        NN = dir.getNetworkNN(network_id)
        # Networks live in the "networkID" namespace; demanding the
        # "bbmri-eric:ID:EXT_" prefix here contradicted the check below.
        if (NN not in NNContacts.NNtoEmails
                and not network_id.startswith('bbmri-eric:networkID:EXT_')):
            report(NN, ERROR, network_id, kind,
                   "NetworkID is not compliant with the specification "
                   ' (shall start with "bbmri-eric:networkID:EXT_" prefix for networks from countries that have no national node)')
        if not network_id.startswith('bbmri-eric:networkID:'):
            report(NN, ERROR, network_id, kind,
                   "NetworkID is not compliant with the specification "
                   + ' (shall start with "bbmri-eric:networkID: prefix)')
        elif not network_id.startswith(
                ('bbmri-eric:networkID:' + NN + '_',
                 'bbmri-eric:networkID:EU_',
                 'bbmri-eric:networkID:EXT_')):
            report(NN, WARNING, network_id, kind,
                   "NetworkID has suspicious country affiliation "
                   + ' (should start with "bbmri-eric:networkID:' + NN + '_'
                   + '" or "bbmri-eric:networkID:EU_" prefix)')
        check_charset(NN, network_id, kind, "NetworkID")
        check_empty_component(NN, network_id, kind, "NetworkID")

    return warnings
def check(self, dir, args):
    """Check consistency of COVID-19 related collections and biobanks.

    First pass (collections): a collection counts as a COVID collection
    when one of its ``diagnosis_available`` IDs matches a COVID-19 code
    (ICD-10 ``U07``, ICD-11 ``RA01`` or one of the listed SNOMED CT codes)
    and as a COVID control collection when it matches ICD-10 ``Z03.818``.
    For such collections the parent biobank must be in
    ``covidNetworkName`` and carry the ``covid19`` attribute, diagnoses
    must not be ranges, and collections whose ID matches
    ``covidProspectiveCollectionIdPattern`` or ends with ``:COVID19`` must
    have the expected types, materials, diagnoses and order of magnitude.

    Second pass (biobanks): network membership, the ``covid19`` attribute
    and the ``ProspectiveCollections`` attribute are cross-checked against
    each other and against what the first pass found.

    Args:
        dir: directory accessor providing ``getCollections()``,
            ``getBiobanks()``, ``getCollectionBiobankId()``,
            ``getBiobankById()``, ``getCollectionNN()`` and
            ``getBiobankNN()``.
        args: parsed options; not used by this check.

    Returns:
        list of ``DataCheckWarning`` objects, in the order they were found.
    """
    warnings = []
    log.info("Running COVID content checks (COVID)")

    ERROR = DataCheckWarningLevel.ERROR
    WARNING = DataCheckWarningLevel.WARNING

    covid_diag_patterns = (
        'U07',  # ICD-10
        'RA01',  # ICD-11
        # SNOMED CT
        '(840533007|840534001|840535000|840536004|840539006|840544004|840546002)',
    )
    # ICD-10; the dot is escaped so that it only matches a literal '.'
    covid_control_pattern = r'Z03\.818'

    common_covid_materials = {
        'DNA', 'PATHOGEN', 'PERIPHERAL_BLOOD_CELLS', 'PLASMA', 'RNA',
        'SALIVA', 'SERUM', 'WHOLE_BLOOD', 'FECES', 'BUFFY_COAT',
        'NASAL_SWAB', 'THROAT_SWAB',
    }
    infectious_materials = ('NASAL_SWAB', 'THROAT_SWAB', 'FECES')

    # biobank id -> True/False; False means "seen, but nothing COVID-like".
    biobank_has_covid_collection = {}
    biobank_has_covid_controls = {}
    # ids of biobanks owning at least one prospective COVID-19 collection
    biobanks_with_prospective = set()

    def ids_of(entity, attribute):
        """Return the 'id' of each item under *attribute* ([] if absent)."""
        if attribute not in entity:
            return []
        return [item['id'] for item in entity[attribute]]

    def report(NN, level, entity_id, entity_type, message):
        """Append one warning to the result list."""
        warnings.append(
            DataCheckWarning(self.__class__.__name__, "", NN, level,
                             entity_id, entity_type, message))

    def report_collection(collection_id, level, message):
        """Append a warning attributed to the collection itself."""
        report(dir.getCollectionNN(collection_id), level, collection_id,
               DataCheckEntityType.COLLECTION, message)

    for collection in dir.getCollections():
        collection_id = collection['id']
        biobank = dir.getBiobankById(
            dir.getCollectionBiobankId(collection_id))
        biobank_id = biobank['id']
        biobank_covid = ids_of(biobank, 'covid19biobank')
        biobank_networks = ids_of(biobank, 'network')

        order_of_magnitude = collection['order_of_magnitude']['id']
        materials = ids_of(collection, 'materials')
        types = ids_of(collection, 'type')

        # A missing 'diagnosis_available' is treated like an empty one,
        # the same way as the other multi-valued attributes.
        diagnoses = ids_of(collection, 'diagnosis_available')
        diag_ranges = [d for d in diagnoses if '-' in d]
        covid_diag = any(
            re.search(pattern, d)
            for d in diagnoses for pattern in covid_diag_patterns)
        covid_control = any(
            re.search(covid_control_pattern, d) for d in diagnoses)
        covid_related = covid_diag or covid_control

        # Record a positive finding, or just initialize the record if it
        # is not yet set at all - a previous True must not be overwritten.
        if covid_diag:
            biobank_has_covid_collection[biobank_id] = True
        else:
            biobank_has_covid_collection.setdefault(biobank_id, False)
        if covid_control:
            biobank_has_covid_controls[biobank_id] = True
        else:
            biobank_has_covid_controls.setdefault(biobank_id, False)

        if covid_related and diag_ranges:
            report_collection(
                collection_id, ERROR,
                "It seems that diagnoses contains range - this will render the diagnosis search ineffective for the given collection. Violating diagnosis term(s): "
                + '; '.join(diag_ranges))

        if covid_related:
            # These two are reported against the biobank, using the
            # national node resolved from the collection.
            if covidNetworkName not in biobank_networks:
                report(dir.getCollectionNN(collection_id), ERROR,
                       biobank_id, DataCheckEntityType.BIOBANK,
                       "Biobank contains COVID collection " + collection_id
                       + ' but not marked as part of ' + covidNetworkName)
            if 'covid19' not in biobank_covid:
                report(dir.getCollectionNN(collection_id), ERROR,
                       biobank_id, DataCheckEntityType.BIOBANK,
                       "Biobank contains COVID collection " + collection_id
                       + ' but does not have "covid19" attribute in "covid19biobank" section of attributes')

        if not types:
            report_collection(collection_id, ERROR,
                              "Collection type not provided")

        is_prospective_id = bool(
            re.search(covidProspectiveCollectionIdPattern, collection_id))

        if is_prospective_id:
            biobanks_with_prospective.add(biobank_id)
            if 'DISEASE_SPECIFIC' not in types:
                report_collection(
                    collection_id, ERROR,
                    "Prospective COVID-19 collections must have DISEASE_SPECIFIC as one of its types")
            if 'PROSPECTIVE_COLLECTION' not in types:
                report_collection(
                    collection_id, ERROR,
                    "Prospective COVID-19 collections must have PROSPECTIVE_COLLECTION as one of its types")
            if order_of_magnitude > 0:
                report_collection(
                    collection_id, WARNING,
                    "Prospective collection type represents capability of setting up prospective collections - hence it should have zero order of magnitude")
            if not covid_related:
                report_collection(
                    collection_id, ERROR,
                    "COVID19PROSPECTIVE collection misses COVID-19 diagnosis or COVID-19 controls filled in")

        if (re.search('^Ability to collect', collection['name'])
                and covid_related and not is_prospective_id):
            report_collection(
                collection_id, ERROR,
                'Collection having "ability to collect" does not have COVID19PROSPECTIVE label')
            # Only reported here because for matching COVID19PROSPECTIVE
            # IDs the same warning has already been issued above.
            if order_of_magnitude > 0:
                report_collection(
                    collection_id, WARNING,
                    "Prospective collection type represents capability of setting up prospective collections - hence it should have zero order of magnitude")

        # Also find other prospective collections containing COVID-19.
        if (not is_prospective_id and covid_diag
                and 'PROSPECTIVE_COLLECTION' in types):
            biobanks_with_prospective.add(biobank_id)
            log.debug(
                "Prospective COVID-19 collection found with non-standard identifier: %s (%s) in biobank %s (%s)",
                collection_id, collection['name'], biobank_id,
                biobank['name'])

        if re.search('.*:COVID19$', collection_id):
            if 'DISEASE_SPECIFIC' not in types:
                report_collection(
                    collection_id, ERROR,
                    "Existing COVID-19 collections must have DISEASE_SPECIFIC as one of its types")
            if common_covid_materials.isdisjoint(materials):
                report_collection(
                    collection_id, WARNING,
                    "Supect material types: existing COVID-19 collection does not have any of the common material types: DNA, PATHOGEN, PERIPHERAL_BLOOD_CELLS, PLASMA, RNA, SALIVA, SERUM, WHOLE_BLOOD, FECES, BUFFY_COAT, NASAL_SWAB, THROAT_SWAB")
            # The biosafety condition must guard all infectious materials.
            # Without the grouping, "and" bound only to the FECES test, so
            # swab collections were flagged even with BSL2/BSL3 available.
            has_infectious = any(
                m in materials for m in infectious_materials)
            has_biosafety = ('BSL2' in biobank_covid
                             or 'BSL3' in biobank_covid)
            if has_infectious and not has_biosafety:
                report_collection(
                    collection_id, WARNING,
                    "Suspect situation: collection contains infectious material (nasal/throat swabs, faeces) while the parent biobank does not indicate BSL2 nor BSL3 available")
            if not covid_diag:
                report_collection(
                    collection_id, ERROR,
                    "COVID19 collection misses COVID-19 diagnosis filled in")

    for biobank in dir.getBiobanks():
        biobank_id = biobank['id']
        biobank_covid = ids_of(biobank, 'covid19biobank')
        biobank_networks = ids_of(biobank, 'network')
        in_covid_network = covidNetworkName in biobank_networks
        flagged_covid19 = 'covid19' in biobank_covid
        kind = DataCheckEntityType.BIOBANK

        if in_covid_network and not flagged_covid19:
            report(dir.getBiobankNN(biobank_id), ERROR, biobank_id, kind,
                   "Biobank is part of " + covidNetworkName
                   + " but does not have covid19 among covid19biobank attributes")
        if flagged_covid19 and not in_covid_network:
            report(dir.getBiobankNN(biobank_id), ERROR, biobank_id, kind,
                   "Biobank has covid19 among covid19biobank attributes but is not part of "
                   + covidNetworkName)

        # Simple check whether the biobank offers other services than
        # just the attribute of being a covid19 biobank.
        other_covid_services = any(s != 'covid19' for s in biobank_covid)
        # The recorded *value* is what matters: the first pass stores
        # False for biobanks whose collections are not COVID related, so a
        # plain membership test would accept any biobank with collections.
        has_covid_content = (
            biobank_has_covid_collection.get(biobank_id, False)
            or biobank_has_covid_controls.get(biobank_id, False))
        if flagged_covid19 and not (has_covid_content
                                    or other_covid_services):
            report(dir.getBiobankNN(biobank_id), ERROR, biobank_id, kind,
                   "Biobank has covid19 among covid19biobank but has no relevant services nor any collection of COVID-19 samples nor any collection of COVID-19 controls")

        has_prospective = biobank_id in biobanks_with_prospective
        declares_prospective = 'ProspectiveCollections' in biobank_covid
        if declares_prospective and not has_prospective:
            report(dir.getBiobankNN(biobank_id), WARNING, biobank_id, kind,
                   "Biobank has ProspectiveCollections among covid19biobank attributes but has no prospective collection defined (collection ID matching '"
                   + covidProspectiveCollectionIdPattern
                   + "' regex pattern)")
        if has_prospective and not declares_prospective:
            report(dir.getBiobankNN(biobank_id), ERROR, biobank_id, kind,
                   "Biobank has prospective COVID-19 collection defined but ProspectiveCollections is not among covid19biobank attributes")

    return warnings