def inspect(self, texts, include_quote=True, min_level=None):
    '''inspect will inspect a dump of text for identifiers
    :param texts: a single string or list of strings to inspect
    :param include_quote: include quotes in the query?
    :param min_level: the minimum likelihood level to return
    '''
    if not isinstance(texts, list):
        texts = [texts]

    if min_level is None:
        min_level = 'LIKELIHOOD_UNSPECIFIED'

    config = {'includeQuote': include_quote,
              'infoTypes': [],
              'maxFindings': 0,
              'minLikelihood': min_level}

    items = []
    for text in texts:
        new_item = {'type': 'text/plain', 'value': text}
        items.append(new_item)

    groups = paginate_items(items, size=100)

    results = []
    for idx in range(len(groups)):
        bot.debug("inspecting group %s of %s" % (idx + 1, len(groups)))
        body = {'inspectConfig': config, 'items': groups[idx]}
        result = self.service.content().inspect(body=body).execute()
        results = results + result['results']
    return results

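# A minimal usage sketch for inspect, assuming a wrapper client (called
# DLPApiConnection here, a hypothetical name) that holds an authenticated
# Google DLP service object as self.service:
#
#     client = DLPApiConnection()
#     findings = client.inspect(texts=["John Smith, MRN 1234567-8"],
#                               include_quote=True,
#                               min_level='LIKELY')
#     for finding in findings:
#         print(finding)
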
def validate_metadata(full_path, metadata_type=None):
    '''validate_metadata checks if a name (either a collection name, or a
    folder for an image or text) has associated metadata, indicated by a
    file of the same name (ending with .json) in the parent directory of
    the named file. If no matching file is found, None is returned and
    the user is alerted. If a matching file is found, it is checked to be
    valid json.
    :param full_path: full path to a file or folder
    :param metadata_type: one of collection, image, or text.
           Default is collection
    '''
    if metadata_type is None:
        metadata_type = "collection"

    parent_dir = os.path.dirname(full_path)
    base_name = os.path.basename(full_path).split('.')[0]
    metadata = "%s/%s.json" % (parent_dir, base_name)

    if os.path.exists(metadata):
        bot.debug('found %s metadata: %s' % (metadata_type, base_name))
        try:
            read_json(metadata)
            bot.info('%s %s metadata is valid' % (metadata_type, base_name))
        except Exception:
            bot.error('%s %s has invalid json metadata %s'
                      % (metadata_type, base_name, metadata))
            return False
    else:
        bot.info('%s %s does not have metadata file %s.json'
                 % (metadata_type, base_name, base_name))
        return None
    return True

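# A minimal sketch of the layout validate_metadata expects: metadata for a
# named folder lives as a json file of the same name in the parent
# directory. The paths below are hypothetical.
def example_validate_metadata():
    # /data/collection1       <- the named folder
    # /data/collection1.json  <- its metadata, checked to be valid json
    return validate_metadata('/data/collection1', metadata_type='collection')
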
def delete(self):
    '''delete an entity, returning its key (None if no entity is loaded)'''
    key = None
    if self._Entity is not None:
        key = self._Entity.key
        bot.debug("Deleting %s" % key)
        self.client.delete(key)
    return key

def get(self, url, headers=None, token=None, data=None, return_json=True):
    '''get will use requests to get a particular url
    '''
    bot.debug("GET %s" % url)
    return self.call(url,
                     func=requests.get,
                     data=data,
                     return_json=return_json)

def post(self, url, data=None, return_json=True):
    '''post will use requests to post to a particular url
    '''
    bot.debug("POST %s" % url)
    return self.call(url,
                     func=requests.post,
                     data=data,
                     return_json=return_json)

def put(self, url, data=None, return_json=True):
    '''put will use requests to put to a particular url
    '''
    bot.debug("PUT %s" % url)
    return self.call(url,
                     func=requests.put,
                     data=data,
                     return_json=return_json)

def get_structures(inputs, build_dir=None, clean_up=True, fail_exit=True):
    '''get_structures will parse one or more compressed files and/or folder
    paths and return a data structure that has full file paths for
    images/text documents, and the loaded json for metadata.
    :param inputs: a single input, or list of inputs, meaning folders and
           compressed files for validation
    :param build_dir: a directory to use to extract and run things.
           If not specified, one is created.
    :param clean_up: boolean to determine if build_dir and subcontents
           should be removed after tests. Default is True.
    :param fail_exit: Given failure of validation, fail the process.
           Otherwise, return False to the calling function.
           Default fail_exit is True
    '''
    if not isinstance(inputs, list):
        inputs = [inputs]
    bot.debug("Found %s inputs to structure using som-validator." % len(inputs))

    # Where are we building?
    if build_dir is None:
        build_dir = tempfile.mkdtemp()

    # We will return a dict of structures, each a collection
    structures = dict()

    # Tell the user about the building folder
    message = "Building folder will be %s"
    if clean_up:
        message = "%s, and will be removed upon completion." % message
    bot.debug(message % build_dir)

    for testing in inputs:
        valid = validate_dataset(dataset=testing,
                                 testing_base=build_dir,
                                 clean_up=clean_up)

        # We only structure input that is valid
        if not valid:
            if fail_exit:
                bot.error("Input %s is not valid, please fix. Exiting." % testing)
                sys.exit(1)
            bot.error("Input %s is not valid, skipping." % testing)
        else:
            structures[testing] = structure_dataset(dataset=testing,
                                                    testing_base=build_dir,
                                                    clean_up=clean_up)
    return structures

def update_fields(self, new_fields, add_new=True):
    '''update_fields will update the model's fields with an input
    dictionary. New fields are added if add_new=True (default); existing
    fields are always overwritten. This does not by default save the
    entity.
    '''
    for key, value in new_fields.items():
        if key in self._Entity.keys():
            bot.debug("%s found existing in Entity, overwriting" % key)
            self._Entity[key] = value
        elif add_new:
            bot.debug("adding %s to Entity" % key)
            self._Entity[key] = value

def run_validation(inputs, test_dir=None, clean_up=True, fail_exit=True):
    '''run_validation will run one or more inputs through the validation
    procedure, meaning checking that the folder (and other data structures)
    fit the WordFish standard:
    http://www.github.com/radinformatics/wordfish-standard
    :param inputs: a single input, or list of inputs, meaning folders and
           compressed files for validation
    :param test_dir: a directory to use to extract and run things.
           If not specified, one is created.
    :param clean_up: boolean to determine if test_dir and subcontents
           should be removed after tests. Default is True.
    :param fail_exit: Given failure of validation, fail the process.
           Otherwise, return False to the calling function.
           Default fail_exit is True
    '''
    if not isinstance(inputs, list):
        inputs = [inputs]
    bot.debug("Found %s inputs to test using som-validator." % len(inputs))

    # Where are we testing?
    if test_dir is None:
        test_dir = tempfile.mkdtemp()

    # Tell the user about the testing folder
    message = "Testing folder will be %s"
    if clean_up:
        message = "%s, and will be removed upon completion." % message
    bot.debug(message % test_dir)

    valid = False
    for testing in inputs:
        if os.path.isdir(testing):
            valid = validate_folder(folder=testing)
        elif re.search("[.]zip$|[.]tar[.]gz$", testing):
            valid = validate_compressed(compressed_file=testing,
                                        testing_base=test_dir,
                                        clean_up=clean_up)

        # Always exit or return False if input is not valid
        if not valid:
            if fail_exit:
                bot.error("Input %s is not valid, please fix and retest. Exiting." % testing)
                sys.exit(1)
            bot.error("Input %s is not valid, please fix and retest. Returning False." % testing)
            return valid
    return valid

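# A usage sketch for run_validation with a mix of a folder and a compressed
# archive; the paths are hypothetical. With fail_exit=False an invalid input
# returns False instead of exiting the process.
def example_run_validation():
    inputs = ['/data/collection1', '/data/collection2.tar.gz']
    return run_validation(inputs, clean_up=True, fail_exit=False)
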
def deidentify(self, ids, study=None):
    '''deidentify: uid endpoint
    https://api.rit.stanford.edu/identifiers/api/v1/uid/{study}
    will take a list of identifiers, and return the deidentified.
    :param ids: a list of identifiers
    :param study: if None, defaults to the client's study (test by default).
    '''
    # Fall back to the client's default study if none is specified
    if study is None:
        study = self.study
    study = study.lower()

    bot.debug("study: %s" % study)
    url = "%s/%s/uid/%s" % (self.base, self.version, study)
    return self.post(url=url, data=ids)

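# A sketch of calling deidentify, assuming an identifiers API client (the
# Client name and constructor below are hypothetical) and the identifiers
# payload format shown in validate_identifiers:
#
#     client = Client(study='test')
#     response = client.deidentify(ids={'identifiers': [
#         {'id': '1234567-8',
#          'id_source': 'stanford',
#          'id_timestamp': '2016-01-30T17:15:23.123Z'}]})
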
def validate_item(item, sources=None, verbose=True):
    '''validate_item will validate a single item object, intended to go in
    as a field to a POST. For more than one item, use the validate_items
    wrapper.
    :param item: the item object. Must include the fields shown below.
    :param sources: a list of valid item sources (eg ["pacs"])
    :param verbose: if True, prints out valid True messages.
           False (errors) always printed

    :: notes

    {
      # generic attribute/values just to store, not interpreted
      # mandatory
      "id":"123123123123123",

      # the issuer for the above id
      # mandatory, or maybe optional with default of "stanford" or "unspecified"
      "id_source":"pacs",

      # when the id was looked up, to help with changed/merged ids
      # optional with default of current timestamp?
      "id_timestamp":"2016-01-30T17:15:23.123Z",

      # optional key/value attributes, will be stored as-is if provided,
      # but not used or interpreted. values will be updated/replaced if
      # sent multiple times (last received wins). any existing values will
      # be preserved if not explicitly set here; set empty string to remove
      "custom_fields":{
         "image_type":"x-ray",
         "resolution":"high"
      }
    }
    '''
    if sources is None:
        sources = item_sources

    # These are the rules for an item
    rules = {
        "id": [Required, Pattern("^[A-Za-z0-9_-]*$")],  # pattern
        "id_source": [Required, In(sources)],           # must be in item sources
        "id_timestamp": [Required, Pattern(timestamp)],
    }

    valid, message = validate(rules, item)
    if verbose:
        bot.debug("identifier %s data structure valid: %s" % (item['id'], valid))

    if not valid:
        bot.error(message)
        if verbose:
            print(item)

    return valid

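# A minimal item that should pass validate_item under the rules above; the
# values are illustrative, and "pacs" is assumed to be a valid item source.
def example_validate_item():
    item = {'id': 'A654321',
            'id_source': 'pacs',
            'id_timestamp': '2016-01-30T17:15:23.123Z'}
    return validate_item(item, sources=['pacs'])
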
def upload_text(self, text, entity, batch=True, fields=None,
                permission=None, mimetype=None):
    '''upload_text will add a text object to the batch manager'''
    new_object = self.upload_object(file_path=text,
                                    entity=entity,
                                    fields=fields,
                                    mimetype=mimetype,
                                    permission=permission,
                                    object_type="Text",
                                    batch=batch)
    bot.debug('TEXT: %s' % new_object)
    return new_object

def find_dicoms(folder, extension=None):
    '''find_dicoms will walk a directory structure and find dicoms in
    subfolders
    :param folder: the parent folder to look in
    :param extension: the extension to use. Default is .dcm
    '''
    folders = dict()
    if extension is None:
        extension = ".dcm"

    for dirpath, dirnames, filenames in os.walk(folder):
        dicoms = []
        for filename in [f for f in filenames if f.endswith(extension)]:
            dicoms.append(os.path.join(dirpath, filename))
        if len(dicoms) > 0:
            folders[dirpath] = dicoms

    bot.debug('Found %s directories with dicom.' % len(folders))
    return folders

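# Usage sketch: map each subfolder of a (hypothetical) study directory to
# its list of .dcm files.
def example_find_dicoms():
    folders = find_dicoms('/data/study1', extension='.dcm')
    for folder, dicom_files in folders.items():
        print("%s: %s dicoms" % (folder, len(dicom_files)))
    return folders
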
def upload_image(self, image, entity, batch=True, fields=None,
                 permission=None, mimetype=None):
    '''upload_image will add an image object to the batch manager
    '''
    new_object = self.upload_object(file_path=image,
                                    entity=entity,
                                    fields=fields,
                                    mimetype=mimetype,
                                    permission=permission,
                                    object_type="Image",
                                    batch=batch)
    bot.debug('IMAGE: %s' % new_object)
    return new_object

def update_headers(self, fields=None):
    '''update_headers will update the client's headers with a simple
    default header for a json post, along with any additional fields
    provided. This function will be adapted as needed.
    '''
    if self.headers is None:
        headers = self._init_headers()
    else:
        headers = self.headers

    if self.token is not None:
        headers["Authorization"] = "Bearer %s" % self.token

    if fields is not None:
        for key, value in fields.items():
            headers[key] = value

    header_names = ",".join(list(headers.keys()))
    bot.debug("Headers found: %s" % header_names)
    self.headers = headers

def detect_compressed(folder, compressed_types=None):
    '''detect_compressed will return a list of files in some folder that
    are compressed. By default this means .zip or .tar.gz, but the caller
    can specify a custom list.
    :param folder: the folder base to use.
    :param compressed_types: a list of types to include, should be
           extensions in format like *.tar.gz, *.zip, etc.
    '''
    compressed = []
    if compressed_types is None:
        compressed_types = ["*.tar.gz", "*.zip"]

    bot.debug("Searching for %s" % ", ".join(compressed_types))

    for filey in os.listdir(folder):
        for compressed_type in compressed_types:
            if fnmatch.fnmatch(filey, compressed_type):
                compressed.append("%s/%s" % (folder, filey))

    bot.debug("Found %s compressed files in %s" % (len(compressed), folder))
    return compressed

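# Usage sketch for detect_compressed with the default patterns made
# explicit; the folder path is hypothetical.
def example_detect_compressed():
    return detect_compressed('/data/uploads',
                             compressed_types=['*.tar.gz', '*.zip'])
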
def add_tag(dicom, name, value):
    '''add_tag will add a tag only if it's in the (active) DicomDictionary
    :param dicom: the pydicom.dataset Dataset (pydicom.read_file)
    :param name: the name of the field to add
    :param value: the value to set, if name is a valid tag
    '''
    dicom_file = os.path.basename(dicom.filename)
    tag = get_tag(name)

    if tag:
        dicom.add_new(tag['tag'], tag['VR'], value)
        # dicom.data_element("PatientIdentityRemoved")
        # (0012, 0062) Patient Identity Removed  CS: 'Yes'
        bot.debug("ADDITION %s to %s." % (dicom.data_element(name), dicom_file))
    else:
        bot.error("%s is not a valid field to add. Skipping." % name)

    return dicom

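# A sketch of tagging a dicom file, assuming the same dicom (pydicom) import
# used by read_series below; the file path is hypothetical, and the field
# must exist in the active DicomDictionary for the addition to succeed.
def example_add_tag():
    ds = dicom.read_file('/data/study1/image1.dcm')
    return add_tag(ds, 'PatientIdentityRemoved', 'Yes')
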
def read_series(dicoms, return_nifti=True):
    '''read_series will read in a series of dicoms belonging to a group
    :param dicoms: a list of dicom files to parse, assumed in the same
           series and of equal size
    :param return_nifti: If True (default) will return the image as a
           Nifti file
    '''
    # Sort the dicoms
    dicoms.sort()

    # Get the size of the image from the first dicom
    params = sniff_header(dicoms[0])
    xdim = params['xdim']
    ydim = params['ydim']
    window_center = params['window_center']

    bot.debug("First dicom found with dimension %s by %s, using as standard." % (xdim, ydim))

    # Order the images based on InstanceNumber
    ordered = dict()
    for d in range(len(dicoms)):
        ds = dicom.read_file(dicoms[d])
        if ds.Rows == xdim and ds.Columns == ydim:
            ordered[int(ds.InstanceNumber)] = ds.pixel_array

    # Fill the volume slice by slice, sorted by InstanceNumber
    zdim = len(ordered)
    data = numpy.ndarray((xdim, ydim, zdim))
    index = 0
    for key in sorted(ordered.keys()):
        data[:, :, index] = ordered[key]
        index += 1

    if return_nifti:
        affine = numpy.diag((1, 1, 1, 1))  # identity affine
        data = nibabel.Nifti1Image(data, affine=affine)
    return data

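# Usage sketch combining find_dicoms and read_series, saving each series as
# a nifti with nibabel; the study path is hypothetical.
def example_read_series():
    for folder, dicom_files in find_dicoms('/data/study1').items():
        nii = read_series(dicom_files, return_nifti=True)
        nibabel.save(nii, '%s/series.nii.gz' % folder)
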
def validate_identifiers(identifiers, id_sources=None, item_sources=None, verbose=True):
    '''validate_identifiers will validate one or more identifier objects,
    intended to go in as a field to a POST
    :param identifiers: the identifiers object.
    :param id_sources: a list of one or more identifier sources.
    :param item_sources: a list of one or more item sources.
    :param verbose: verbose output for items
    If either source list is not defined, the default standards are used.

    :: notes

    {
      # mandatory key for uniquely identifying the person
      "id":"1234567-8",

      # the issuer for the above id
      # mandatory, or maybe optional with default of "stanford" or "unspecified"
      "id_source":"stanford",

      # when the id was looked up, to help with changed/merged ids
      # optional with default of current timestamp?
      "id_timestamp":"2016-01-30T17:15:23.123Z",

      # optional key/value attributes, will be stored as-is if provided,
      # but not used or interpreted. values will be updated/replaced if
      # sent multiple times (last received wins). any existing values will
      # be preserved if not explicitly set here; set empty string to remove
      "custom_fields":{
         "first_name":"Joe",
         "last_name":"Smith",
         "dob":"1970-02-28"
      }
    }
    '''
    if id_sources is None:
        id_sources = identifier_sources

    # These are the rules for a person
    rules = {
        "id": [Required, Pattern("^[A-Za-z0-9_-]*$")],  # pattern
        "id_source": [Required, In(id_sources)],        # must be in person sources
        "id_timestamp": [Required, Pattern(timestamp)],
    }

    if not isinstance(identifiers, dict):
        bot.error("Identifiers data structure must be dictionary.")
        return False

    if "identifiers" not in identifiers:
        bot.error("identifiers key not found in data structure.")
        return False

    items = identifiers['identifiers']
    if not isinstance(items, list):
        bot.error("Items in identifiers data structure must be list.")
        return False

    valid = True
    for item in items:
        valid, message = validate(rules, item)
        if not valid:
            bot.error(message)
            return valid
        if "items" in item:
            if not validate_items(item['items'], sources=item_sources):
                return False

    bot.debug("Identifiers data structure valid: %s" % valid)
    return valid

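# A minimal identifiers structure that should pass validate_identifiers,
# assuming "stanford" is a valid identifier source; values are illustrative.
def example_validate_identifiers():
    identifiers = {'identifiers': [
        {'id': '1234567-8',
         'id_source': 'stanford',
         'id_timestamp': '2016-01-30T17:15:23.123Z'}]}
    return validate_identifiers(identifiers, id_sources=['stanford'])
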
def set_table(self, table, clear_rows=True):
    '''set_table will set the manager's table, by default clearing any
    previously added rows.
    '''
    self.table = table
    if clear_rows is True:
        bot.debug("Clearing previously added rows. Set clear_rows to False to prevent this.")
        self.rows = []
    return self.rows

def receive_identifiers(response):
    '''receive_identifiers will validate reception of an identifiers
    response. This should be a list.
    :param response: the response list of identifiers

    :: notes

    successful response:

    HTTP 200

    [
       {'jittered_timestamp': '2016-01-30T17:15:23.123Z',
        'id': '12345678',
        'suid': '103e',
        'id_source': 'Stanford MRN',
        'custom_fields': [{'key': 'studySiteID', 'value': '78329'}],
        'items': [{'id_source': 'GE PACS',
                   'jittered_timestamp': '2016-01-15T17:15:23.123Z',
                   'id': 'A654321',
                   'suid': '103e'}]}
    ]
    '''
    # These fields are expected, but not required. We will error
    # if any fields are present outside this scope
    expected_fields = ['items',
                       'id_source',
                       'jittered_timestamp',
                       'suid',
                       'id',
                       'custom_fields']

    if not isinstance(response, list):
        bot.error("Response must be a list")
        return False

    # These are the rules for each uidEntity
    rules = {
        "id": [Required, Pattern("^[A-Za-z0-9_-]*$")],    # pattern
        "suid": [Required, Pattern("^[A-Za-z0-9_-]*$")],  # the suid
        "id_source": [Required, In(identifier_sources)],  # must be in identifier sources
        "jittered_timestamp": [Required, Pattern(timestamp)]
    }

    valid = True
    for item in response:

        # Validate required fields
        valid, message = validate(rules, item)
        if not valid:
            bot.error(message)
            return valid

        # Validate fields returned in response
        if not validate_fields(expected_fields, item.keys()):
            return False

        # Validate items
        if "items" in item:
            if not receive_items(item['items']):
                return False

    bot.debug("Identifiers data structure valid: %s" % valid)
    return valid

def progress_download(collection_name, output_folder, suid, project,
                      bucket_name, query_entity=True, filters=None):
    '''show progress while downloading images for a Collection/[c]/Entity/study

    Parameters
    ==========
    collection_name: the name of the collection, typically an IRB number
    output_folder: the base directory to create a study folder in
    suid: an suid of interest to query (eg, if querying an Entity, you
          would use the suid of the patient; for an Image, the suid of
          the study SUID --> (coded accession#))
    project: Google Cloud project name
    bucket_name: the name for the Google Storage Bucket (usually provided)
    query_entity: by default, we query the entity first, and then get
          images. To query the images (studies) directly, set this to False.
    filters: a list of tuples to apply to filter the query. The default
          depends on query_entity: [("uid", "=", suid)] for entities,
          [("AccessionNumber", "=", suid)] for images.

    Returns
    =======
    list of paths to the newly created image and metadata files
    '''
    if filters is None:
        if query_entity is True:
            filters = [("uid", "=", suid)]
        else:
            filters = [("AccessionNumber", "=", suid)]

    bot.info("Collecting available images...")

    try:
        storage_client = storage.Client()
    except DefaultCredentialsError:
        bot.error("We didn't detect your GOOGLE_APPLICATION_CREDENTIALS in the environment! Did you export the path?")
        sys.exit(1)
    except Forbidden:
        bot.error("The service account specified by GOOGLE_APPLICATION_CREDENTIALS does not have permission to use this resource.")
        sys.exit(1)

    if not os.path.exists(output_folder):
        os.mkdir(output_folder)

    bucket = storage_client.get_bucket(bucket_name)

    # Retrieve bucket, datastore client, images
    requester = RetryRequester(bucket_name=bucket_name, project=project)
    collection = requester.create_collection(collection_name)

    if query_entity is True:
        entity_set = requester.get_entity(filters)
        images = []
        for entity in entity_set:
            entity_images = requester.client.get_images(entity=entity)
            images += [x for x in entity_images if x not in images]
    else:
        images = requester.get_images(filters)

    bot.info("Found %s images for suid %s in collection %s"
             % (len(images), suid, collection_name))

    progress = 0
    total = len(images)

    files = []
    if len(images) > 0:
        bot.debug("Saving images and metadata...")
        for image in images:

            # Download image
            file_name = prepare_folders(output_folder=output_folder,
                                        image_name=image.key.name)
            blob = bucket.blob(image['storage_name'])
            bot.show_progress(progress, total, length=35)
            requester.download(blob, file_name)
            files.append(file_name)
            files.append(save_metadata(image, file_name))
            progress += 1
            bot.show_progress(progress, total, length=35)

        # Newline to finish progress bar
        sys.stdout.write('\n')

    return files

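# A usage sketch for progress_download; the collection, suid, project, and
# bucket names are hypothetical, and GOOGLE_APPLICATION_CREDENTIALS must be
# exported for the storage client to authenticate.
def example_progress_download():
    return progress_download(collection_name='IRB41449',
                             output_folder='/tmp/downloads',
                             suid='103e',
                             project='som-project',
                             bucket_name='som-bucket',
                             query_entity=True)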