Example #1
def get_notifier():
    '''get notifier will return a basic pyinotify watch manager
    based on the user's inotify watch paths in settings.
    if there is an error, returns None.
    '''

    try:
        import pyinotify
    except ImportError:
        bot.error("pyinotify is not installed.")
        return None

    level = get_level()
    wm = pyinotify.WatchManager()
    for path, mask, processor_cls in settings.INOTIFIER_WATCH_PATHS:
        cls_path = '.'.join(processor_cls.split('.')[0:-1])
        cls = processor_cls.split('.')[-1]
        mod = __import__(cls_path, globals(), locals(), [cls], level)
        Processor = getattr(mod, cls)
        wm.add_watch(path, mask, proc_fun=Processor())
        bot.debug("Adding watch on %s, processed by %s" %
                  (path, processor_cls))

    notifier = pyinotify.Notifier(wm)
    return notifier
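A minimal usage sketch (assuming pyinotify is installed and settings.INOTIFIER_WATCH_PATHS is configured); the returned notifier is typically driven with pyinotify's blocking event loop:

# Hypothetical usage: start dispatching filesystem events to the configured processors
notifier = get_notifier()
if notifier is not None:
    notifier.loop()  # blocks; pyinotify calls each processor as events arrive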
Example #2
File: utils.py Project: IMAGE-ET/sendit
def update_cached(subfolder=None):
    '''
    update the queue (batch object with status QUEUE), intended to be
    run when there are new folders to find and queue.
    First preference goes to a folder supplied to the function, then
    to application defaults. Returns None if no folder can be determined.
    '''
    CHECK_FOLDERS = None

    # First preference goes to variable given at runtime
    if subfolder is not None:
        CHECK_FOLDERS = subfolder

    # Second preference goes to DATA_INPUT_FOLDERS
    if CHECK_FOLDERS is None and DATA_INPUT_FOLDERS not in ['', None]:
        CHECK_FOLDERS = DATA_INPUT_FOLDERS

    # Final preference goes to data subfolder. We don't parse root.
    # The base of data has directories that need to be organized
    if CHECK_FOLDERS is None:
        if DATA_SUBFOLDER is not None:
            CHECK_FOLDERS = "%s/%s" % (DATA_BASE, DATA_SUBFOLDER)
        else:
            bot.error(
                "Specify DATA_INPUT_FOLDERS in settings for cached jobs.")
            return

    if not isinstance(CHECK_FOLDERS, list):
        CHECK_FOLDERS = [CHECK_FOLDERS]

    count = 0
    current = [x.uid for x in Batch.objects.all()]
    for base in CHECK_FOLDERS:
        print('Checking base %s' % base)
        if os.path.exists(base) and os.path.isdir(base):
            # If it's not a date
            if not re.search('[0-9]{10}$', base):
                contenders = [base]
            else:
                contenders = get_contenders(base=base, current=current)
            for contender in contenders:
                dicom_dir = "%s/%s" % (base, contender)
                dcm_folder = os.path.basename(dicom_dir)
                batch, created = Batch.objects.get_or_create(uid=dcm_folder)
                if created is True:
                    batch.status = "QUEUE"
                    batch.logs['DICOM_DIR'] = dicom_dir
                    count += 1
                batch.save()

    print("Added %s contenders for processing queue." % count)
Example #3
def clean_up(bid, remove_batch=False):
    '''clean up will check a batch for errors, and if none exist, clear the entries
    from the database. If no errors occurred, the original folder would have been deleted
    after dicom import.
    '''
    try:
        batch = Batch.objects.get(id=bid)
    except Batch.DoesNotExist:
        bot.error("In clean_up: Batch %s does not exist." % bid)
        return None

    # force clean up for now, we don't have much server space
    has_error = batch.has_error
    has_error = False

    if not has_error:
        images = batch.image_set.all()
        for image in images:
            image.image.delete()  # deletes the stored image file
            image.delete()        # deletes the database object
        if remove_batch is True:
            batch.delete()  # django-cleanup will delete files on delete
    else:
        bot.warning("Batch %s has error, will not be cleaned up." % batch.id)
Example #4
File: get.py Project: pombredanne/sendit
def import_dicomdir(dicom_dir, run_get_identifiers=True):
    '''import dicom directory manages importing a valid dicom set into
    the application, and is a celery job triggered by the watcher.
    Here we also flag (and exclude) images that have a header value
    that indicates pixel identifiers.
    '''
    start_time = time.time()

    if os.path.exists(dicom_dir):
        try:
            dicom_files = ls_fullpath(dicom_dir)
        except NotADirectoryError:
            bot.error('%s is not a directory, skipping.' % dicom_dir)
            return

        bot.debug("Importing %s, found %s .dcm files" %
                  (dicom_dir, len(dicom_files)))

        # The batch --> the folder with a set of dicoms tied to one request
        dcm_folder = os.path.basename(dicom_dir)
        batch, created = Batch.objects.get_or_create(uid=dcm_folder)
        batch.logs['STARTING_IMAGE_COUNT'] = len(dicom_files)

        # Data quality check: keep a record of study dates
        study_dates = dict()
        size_bytes = sum(os.path.getsize(f) for f in dicom_files)
        messages = []  # print all unique messages / warnings at end

        # Add in each dicom file to the series
        for dcm_file in dicom_files:
            try:

                # The dicom folder will be named based on the accession#
                dcm = read_file(dcm_file, force=True)
                dicom_uid = os.path.basename(dcm_file)

                # Keep track of studyDate
                study_date = dcm.get('StudyDate')
                if study_date not in study_dates:
                    study_dates[study_date] = 0
                study_dates[study_date] += 1
                flag, flag_group, reason = has_burned_pixels(
                    dicom_file=dcm_file, quiet=True, deid=STUDY_DEID)

                # If the image is flagged, we don't include and move on
                continue_processing = True
                if flag is True:
                    if flag_group not in ["whitelist"]:
                        continue_processing = False
                        message = "%s is flagged in %s: %s, skipping" % (
                            dicom_uid, flag_group, reason)

                        batch = add_batch_warning(message, batch, quiet=True)
                        message = "BurnedInAnnotation found for batch %s" % batch.uid
                        if message not in messages:
                            messages.append(message)

                if continue_processing is True:
                    # Create the Image object in the database
                    # A dicom instance number must be unique for its batch
                    dicom = Image.objects.create(batch=batch, uid=dicom_uid)

                    # Save the dicom file to storage
                    # basename = "%s/%s" %(batch.id,os.path.basename(dcm_file))
                    dicom = save_image_dicom(dicom=dicom,
                                             dicom_file=dcm_file)  # Also saves

                    # Generate image name based on [SUID] added later
                    # accessionnumberSUID.seriesnumber.imagenumber,
                    name = "%s_%s.dcm" % (dcm.get('SeriesNumber'),
                                          dcm.get('InstanceNumber'))
                    dicom.name = name
                    dicom.save()
                    # Only remove files successfully imported
                    #os.remove(dcm_file)

            # Note that on error we don't remove files
            except InvalidDicomError:
                message = "InvalidDicomError: %s skipping." % (dcm_file)
                batch = add_batch_error(message, batch)
            except KeyError:
                message = "KeyError: %s is possibly invalid, skipping." % (
                    dcm_file)
                batch = add_batch_error(message, batch)
            except Exception as e:
                message = "Exception: %s, for %s, skipping." % (e, dcm_file)
                batch = add_batch_error(message, batch)

        # Print summary messages all at once
        for message in messages:
            bot.warning(message)

        if len(study_dates) > 1:
            message = "% study dates found for %s" % (len(study_dates),
                                                      dcm_file)
            batch = add_batch_error(message, batch)

        # Save batch thus far
        batch.qa['StudyDate'] = study_dates
        batch.qa['StartTime'] = start_time
        batch.qa['SizeBytes'] = size_bytes
        batch.save()

        # If there were no errors on import, we should remove the directory
        #if not batch.has_error:

        # Should only be called given no error, and should trigger error if not empty
        #os.rmdir(dicom_dir)

        # At the end, submit the dicoms to be anonymized as a batch
        count = batch.image_set.count()
        if count > 0:
            if ANONYMIZE_PIXELS is True:
                bot.warning(
                    "Anonimization of pixels is not yet implemented. Images were skipped."
                )
                # When this is implemented, the function will be modified to add these images
                # to the batch, which will then be first sent through a function to
                # scrub pixels before header data is looked at.
                # scrub_pixels(bid=batch.id)
            #else:
            if run_get_identifiers is True:
                bot.debug("get_identifiers submit batch %s with %s dicoms." %
                          (batch.uid, count))
                return get_identifiers(bid=batch.id)
            else:
                bot.debug("Finished batch %s with %s dicoms" %
                          (batch.uid, count))
                return batch
        else:
            # No images for further processing
            batch.status = "EMPTY"
            batch.qa['FinishTime'] = time.time()
            message = "%s is flagged EMPTY, no images pass filter" % (batch.id)
            batch = add_batch_warning(message, batch)
            batch.save()
            return

    else:
        bot.warning('Cannot find %s' % dicom_dir)
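A minimal usage sketch (hypothetical directory name; whether this is called directly or dispatched through celery depends on how the task is registered in this project):

# Import a dicom folder and, by default, submit the batch to get_identifiers
import_dicomdir('/data/accession-12345')

# Import only, returning the Batch without submitting identifiers
batch = import_dicomdir('/data/accession-12345', run_get_identifiers=False)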
Example #5
def upload_storage(batch_ids=None):
    '''upload_storage will, as a batch, send all batches with DONEPROCESSING
    status to Google Cloud Storage.
    '''
    from sendit.settings import (GOOGLE_CLOUD_STORAGE,
                                 SEND_TO_GOOGLE,
                                 GOOGLE_PROJECT_NAME,
                                 GOOGLE_STORAGE_COLLECTION)

    if batch_ids is None:
        batches = Batch.objects.filter(status="DONEPROCESSING")
    else:
        batches = Batch.objects.filter(status="DONEPROCESSING", id__in=batch_ids)

    # All variables must be defined for sending!
    if GOOGLE_CLOUD_STORAGE in [None,""]:
        SEND_TO_GOOGLE = False

    if GOOGLE_PROJECT_NAME in [None,""]:
        SEND_TO_GOOGLE = False

    if GOOGLE_STORAGE_COLLECTION in [None,""]:
        SEND_TO_GOOGLE = False

    if SEND_TO_GOOGLE is True:
        from deid.identifiers import get_timestamp

        try:
            client = get_client(bucket_name=GOOGLE_CLOUD_STORAGE,
                                project_name=GOOGLE_PROJECT_NAME)

        # Client is unreachable, usually network is being stressed
        # this is why we instantiate in batches to upload
        except Exception:  # OSError, ServiceUnavailable
            bot.error("Cannot connect to client.")
            return

        # Create/get BigQuery dataset, collection should be IRB
        dataset = client.get_or_create_dataset(GOOGLE_STORAGE_COLLECTION)

        # Create a table based on ...
        table = client.get_or_create_table(dataset=dataset,    # All tables named dicom
                                           table_name='dicom',
                                           schema=dicom_schema)
        
        for batch in batches:
            valid = True
            batch.qa['UploadStartTime'] = time.time()
            batch_ids = BatchIdentifiers.objects.get(batch=batch)
            # Retrieve only images that aren't in PHI folder
            images = batch.get_finished()
            # Stop if no images pass filters
            if len(images) == 0:
                change_status(batch, "EMPTY")
                message = "batch %s has no images for processing, stopping upload" % batch.id
                batch = add_batch_warning(message, batch)
                batch.save()
                continue

            # IR0001fa6_20160525_IR661B54.tar.gz
            # (coded MRN?)_jittereddate_studycode
            required_fields = ['AccessionNumber', 'PatientID']
            for required_field in required_fields:
                if required_field not in batch_ids.shared:
                    change_status(batch, "ERROR")
                    message = "batch ids %s do not have shared PatientID or AccessionNumber, stopping upload" % batch.id
                    batch = add_batch_warning(message, batch)
                    batch.save()
                    valid = False

            # Skip this batch entirely if a required field is missing
            if valid is False:
                continue

            # Add additional shared metadata
            studycode = batch_ids.shared['AccessionNumber']
            coded_mrn = batch_ids.shared['PatientID']
            batch_ids.shared['CodedPatientID'] = coded_mrn
            batch_ids.shared['ContentType'] = 'application/gzip'
            batch_ids.shared['CodedAccessionNumberID'] = studycode
            batch_ids.shared['NumberOfSeries'] = batch.qa['NumberOfSeries']
            batch_ids.shared['Series'] = batch.qa['Series']
            batch_ids.shared['RemovedSeries'] = batch.qa['FlaggedSeries']
            timestamp = get_timestamp(batch_ids.shared['StudyDate'],
                                      format="%Y%m%d")
            compressed_filename = "%s/%s_%s_%s.tar.gz" %(batch.get_path(),
                                                         coded_mrn,
                                                         timestamp,
                                                         studycode)
            compressed_file = generate_compressed_file(files=images, # mode="w:gz"
                                                       filename=compressed_filename) 
            # File will be None if no files added
            if compressed_file is None:
                change_status(batch, "ERROR")
                message = "batch %s problem compressing file, stopping upload" % batch.id
                batch = add_batch_error(message, batch)
                batch.save()
                valid = False
                continue

            # We prepare shared metadata for one item
            batch_ids.shared['IMAGE_COUNT'] = len(images)
            batch.logs['IMAGE_COUNT'] = len(images)
            batch_ids.save()
            batch.save()
            if valid is True:
                metadata = deepcopy(batch_ids.shared)
                metadata['DicomHeader'] = json.dumps(metadata)
                metadata = { compressed_file: metadata }
                bot.log("Uploading %s with %s images to Google Storage %s" %(os.path.basename(compressed_file),
                                                                         len(images),
                                                                         GOOGLE_CLOUD_STORAGE))
                # We only expect to have one entity per batch
                kwargs = {"items":[compressed_file],
                          "table":table,
                          "study": SOM_STUDY,
                          "metadata": metadata,
                          "batch": False} # upload in batches at END

                # Batch metadata    
                upload_dataset(client=client, k=kwargs)

                # Clean up compressed file
                if os.path.exists(compressed_file):
                    os.remove(compressed_file)

                # Finish and record time elapsed
                change_status(batch,"DONE")

            batch.qa['UploadFinishTime'] = time.time()
            total_time = batch.qa['UploadFinishTime'] - batch.qa['UploadStartTime']
            bot.info("Total time for %s: %s images is %f min" %(batch.uid,
                                                                batch.image_set.count(),
                                                                total_time/60))
            batch.qa['ElapsedTime'] = total_time
            batch.save()

        # After image upload, metadata can be uploaded on one batch
        # If this isn't optimal, change "batch" in kwargs to False
        return client.batch.runInsert(table)
Example #6
def upload_storage(batch_ids=None):
    '''upload_storage will, as a batch, send all batches with DONEPROCESSING
    status to Google Cloud Storage.
    '''
    from sendit.settings import (GOOGLE_CLOUD_STORAGE, SEND_TO_GOOGLE,
                                 GOOGLE_PROJECT_NAME, GOOGLE_PROJECT_ID_HEADER,
                                 GOOGLE_STORAGE_COLLECTION)

    if batch_ids is None:
        batches = Batch.objects.filter(status="DONEPROCESSING")
    else:
        batches = Batch.objects.filter(status="DONEPROCESSING",
                                       id__in=batch_ids)

    # All variables must be defined for sending!
    if GOOGLE_CLOUD_STORAGE in [None, ""]:
        SEND_TO_GOOGLE = False

    if GOOGLE_PROJECT_NAME in [None, ""]:
        SEND_TO_GOOGLE = False

    if GOOGLE_STORAGE_COLLECTION in [None, ""]:
        SEND_TO_GOOGLE = False

    if SEND_TO_GOOGLE is True:
        from deid.identifiers import get_timestamp

        # I'm not sure we need this
        #if GOOGLE_PROJECT_ID_HEADER is not None:
        #    client.headers["x-goog-project-id"] = GOOGLE_PROJECT_ID_HEADER
        try:
            client = get_client(bucket_name=GOOGLE_CLOUD_STORAGE,
                                project_name=GOOGLE_PROJECT_NAME)
        # Client is unreachable, usually network is being stressed

        except Exception:  # OSError, ServiceUnavailable
            bot.error("Cannot connect to client.")
            return

        collection = client.create_collection(uid=GOOGLE_STORAGE_COLLECTION)
        for batch in batches:
            valid = True
            batch_ids = BatchIdentifiers.objects.get(batch=batch)

            # Retrieve only images that aren't in PHI folder
            images = batch.get_finished()

            # Stop if no images pass filters
            if len(images) == 0:
                change_status(batch, "EMPTY")
                message = "batch %s has no images for processing, stopping upload" % (
                    batch.id)
                batch = add_batch_warning(message, batch)
                batch.save()
                continue

            # IR0001fa6_20160525_IR661B54.tar.gz
            # (coded MRN?)_jittereddate_studycode
            required_fields = ['AccessionNumber', 'PatientID']
            for required_field in required_fields:
                if required_field not in batch_ids.shared:
                    change_status(batch, "ERROR")
                    message = "batch ids %s do not have shared PatientID or AccessionNumber, stopping upload" % (
                        batch.id)
                    batch = add_batch_warning(message, batch)
                    batch.save()
                    valid = False

            # Skip this batch entirely if a required field is missing
            if valid is False:
                continue

            studycode = batch_ids.shared['AccessionNumber']
            coded_mrn = batch_ids.shared['PatientID']
            timestamp = get_timestamp(batch_ids.shared['StudyDate'],
                                      format="%Y%m%d")

            compressed_filename = "%s/%s_%s_%s.tar.gz" % (
                batch.get_path(), coded_mrn, timestamp, studycode)
            compressed_file = generate_compressed_file(
                files=images,  # mode="w:gz"
                filename=compressed_filename)

            # File will be None if no files added
            if compressed_file is None:
                change_status(batch, "ERROR")
                message = "batch %s problem compressing file, stopping upload" % (
                    bid)
                batch = add_batch_error(message, batch)
                batch.save()
                valid = False
                continue

            # We prepare shared metadata for one item
            batch_ids.shared['IMAGE_COUNT'] = len(images)
            batch.logs['IMAGE_COUNT'] = len(images)
            batch_ids.save()
            batch.save()
            if valid is True:
                items_metadata = batch_ids.shared
                items = {compressed_file: items_metadata}
                cleaned = deepcopy(batch_ids.cleaned)
                metadata = prepare_entity_metadata(cleaned_ids=cleaned)
                bot.log("Uploading %s with %s images to Google Storage %s" %
                        (os.path.basename(compressed_file), len(images),
                         GOOGLE_CLOUD_STORAGE))
                # We only expect to have one entity per batch
                uid = list(metadata.keys())[0]
                kwargs = {
                    "images": [compressed_file],
                    "collection": collection,
                    "uid": uid,
                    "entity_metadata": metadata[uid],
                    "images_metadata": items
                }

                # Batch metadata
                upload_dataset(client=client, k=kwargs)

                # Clean up compressed file
                if os.path.exists(compressed_file):
                    os.remove(compressed_file)

                # Finish and record time elapsed
                change_status(batch, "DONE")

            batch.qa['FinishTime'] = time.time()
            total_time = batch.qa['FinishTime'] - batch.qa['StartTime']
            bot.info("Total time for %s: %s images is %f min" %
                     (batch.uid, batch.image_set.count(), total_time / 60))
            batch.qa['ElapsedTime'] = total_time
            batch.save()
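A minimal usage sketch (hypothetical primary keys; assumes the batches have reached DONEPROCESSING and the Google settings referenced above are populated):

# Upload every finished batch to Google Cloud Storage
upload_storage()

# Or restrict the upload to specific batches
upload_storage(batch_ids=[3, 7])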