def create_chunks():
    """
    Gets unaugmented positive chunks and saves them to positive_no_aug.

    :return:
    """
    client = cloud.authenticate()
    bucket = client.get_bucket('elvos')

    # loop through every positive array on GCS -- no need to loop through
    # negatives, as those are fine in their current state
    for in_blob in bucket.list_blobs(prefix='chunk_data/normal/positive'):
        # get the file id
        file_id = in_blob.name.split('/')[3]
        file_id = file_id.split('.')[0]
        logging.info(f'getting {file_id}')

        # copy the chunk if it's the original image, not a
        # rotation/reflection
        if file_id.endswith('_1'):
            logging.info(f'downloading {file_id}')
            arr = cloud.download_array(in_blob)
            cloud.save_chunks_to_cloud(arr, 'normal', 'positive_no_aug',
                                       file_id)
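
# For context: chunk IDs appear to follow a <patient_id>_<n> convention,
# where _1 is the unaugmented original and _2 through _24 are
# rotations/reflections (see main() below). A minimal sketch of the name
# parsing above, using a made-up blob name:
name = 'chunk_data/normal/positive/ABC123DEF456GH78_1.npy'

file_id = name.split('/')[3]       # 'ABC123DEF456GH78_1.npy'
file_id = file_id.split('.')[0]    # 'ABC123DEF456GH78_1'
print(file_id.endswith('_1'))      # True: this is an original chunk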
def inspect_rois(annotations_df):
    """
    Sanity-check function to make sure that the ROIs we're getting
    actually contain occlusions.

    :param annotations_df: annotations
    :return:
    """
    client = cloud.authenticate()
    bucket = client.get_bucket('elvos')

    # loop through every array on GCS
    for in_blob in bucket.list_blobs(prefix='airflow/npy'):
        # if in_blob.name != 'airflow/npy/ZZX0ZNWG6Q9I18GK.npy':
        #     continue

        # blacklist
        if in_blob.name == 'airflow/npy/LAUIHISOEZIM5ILF.npy':
            continue

        # get the file id
        file_id = in_blob.name.split('/')[2]
        file_id = file_id.split('.')[0]
        logging.info(f'chunking {file_id}')

        # copy ROI if there's a positive match in the ROI annotations
        roi_df = annotations_df[annotations_df['patient_id'].str.match(
            file_id)]

        # if it's empty, this brain is ELVO negative
        elvo_positive = not roi_df.empty

        arr = cloud.download_array(in_blob)

        # if it's elvo positive
        if elvo_positive:
            chunks = []

            # get ROI location
            blue = int(len(arr) - roi_df['blue2'].iloc[0])
            green = int(roi_df['green1'].iloc[0])
            red = int(roi_df['red1'].iloc[0])

            # try both axis orderings, since green/red may be swapped
            chunks.append(arr[blue:blue + 32, green:green + 50,
                              red:red + 50])
            chunks.append(arr[blue:blue + 32, red:red + 50,
                              green:green + 50])

            # loop through all relevant chunks and show the axial, coronal,
            # and sagittal views to make sure there's an occlusion
            for chunk in chunks:
                axial = np.max(chunk, axis=0)
                coronal = np.max(chunk, axis=1)
                sag = np.max(chunk, axis=2)
                fig, ax = plt.subplots(1, 3, figsize=(6, 4))
                ax[0].imshow(axial, interpolation='none')
                ax[1].imshow(coronal, interpolation='none')
                ax[2].imshow(sag, interpolation='none')
                plt.show()
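
# The annotation coordinates map onto the array axes as follows: blue is
# the depth axis measured from the far end (hence the len(arr) - blue2
# flip), green indexes rows, and red indexes columns. A minimal sketch of
# that conversion with made-up values:
import numpy as np

arr = np.zeros((200, 512, 512))       # hypothetical scan, (z, y, x)
blue2, green1, red1 = 150, 240, 260   # hypothetical annotation values

z0 = int(len(arr) - blue2)            # flip blue: 200 - 150 = 50
roi = arr[z0:z0 + 32, green1:green1 + 50, red1:red1 + 50]
print(roi.shape)                      # (32, 50, 50)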
def normal_mip():
    configure_logger()
    client = cloud.authenticate()
    bucket = client.get_bucket('elvos')

    # iterate through every source directory...
    for location in WHENCE:
        prefix = location + '/'
        logging.info(f"MIPing images from {prefix}")

        # get every blob
        for in_blob in bucket.list_blobs(prefix=prefix):
            # blacklist
            if in_blob.name == prefix + 'LAUIHISOEZIM5ILF.npy':
                continue

            file_id = in_blob.name.split('/')[2]
            file_id = file_id.split('.')[0]

            # perform the normal MIPing procedure
            logging.info(f'downloading {in_blob.name}')
            input_arr = cloud.download_array(in_blob)
            logging.info(f"blob shape: {input_arr.shape}")

            # if it's an axial failure analysis scan, do the failure
            # analysis crop
            if file_id in FAILURE_ANALYSIS and location == 'numpy/axial':
                cropped_arr = transforms.crop_normal_axial_fa(input_arr,
                                                              location)
            # otherwise do a normal axial crop
            elif location == 'numpy/axial':
                cropped_arr = transforms.crop_normal_axial(input_arr,
                                                           location)
            # coronal scans all use the standard coronal crop
            else:
                cropped_arr = transforms.crop_normal_coronal(input_arr,
                                                             location)

            # remove extremes
            not_extreme_arr = transforms.remove_extremes(cropped_arr)
            logging.info('removed array extremes')

            # MIP array
            mip_arr = transforms.mip_normal(not_extreme_arr)

            # OPTIONAL: visualize MIP
            # plt.figure(figsize=(6, 6))
            # plt.imshow(mip_arr, interpolation='none')
            # plt.show()

            # save to cloud
            cloud.save_npy_to_cloud(mip_arr, file_id, location, 'normal')
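
# transforms.mip_normal is defined elsewhere in the repo; a conventional
# maximum intensity projection just collapses the depth axis with np.max.
# A minimal sketch, assuming depth is axis 0 (illustrative only, not the
# repo's actual implementation):
import numpy as np


def mip_sketch(arr: np.ndarray) -> np.ndarray:
    """Collapse a 3D volume to 2D by keeping the brightest voxel per ray."""
    return np.max(arr, axis=0)


volume = np.random.randint(-1024, 3000, size=(64, 128, 128))
print(mip_sketch(volume).shape)  # (128, 128)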
def multichannel_mip():
    configure_logger()
    client = cloud.authenticate()
    bucket = client.get_bucket('elvos')

    # iterate through every source directory...
    for location in WHENCE:
        prefix = location + '/'
        logging.info(f"MIPing images from {prefix}")

        for in_blob in bucket.list_blobs(prefix=prefix):
            # blacklist
            if in_blob.name == prefix + 'LAUIHISOEZIM5ILF.npy':
                continue

            file_id = in_blob.name.split('/')[2]
            file_id = file_id.split('.')[0]

            # perform the multichannel MIPing procedure
            logging.info(f'downloading {in_blob.name}')
            input_arr = cloud.download_array(in_blob)
            logging.info(f"blob shape: {input_arr.shape}")

            # if it's an axial failure analysis scan, do the failure
            # analysis crop
            if file_id in FAILURE_ANALYSIS and location == 'numpy/axial':
                cropped_arr = transforms.crop_multichannel_axial_fa(
                    input_arr, location)
            # otherwise do a normal axial crop
            elif location == 'numpy/axial':
                cropped_arr = transforms.crop_multichannel_axial(
                    input_arr, location)
            # coronal scans all use the standard coronal crop
            else:
                cropped_arr = transforms.crop_multichannel_coronal(
                    input_arr)

            # segment out the vessels
            not_extreme_arr = transforms.segment_vessels(cropped_arr)
            logging.info('segmented vessels')

            mip_arr = transforms.mip_multichannel(not_extreme_arr)

            # OPTIONAL: visualize one channel of the MIP
            # plt.figure(figsize=(6, 6))
            # plt.imshow(mip_arr[1], interpolation='none')
            # plt.show()

            # save to the numpy generator source directory
            cloud.save_segmented_npy_to_cloud(mip_arr, file_id, location,
                                              'multichannel')
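
# transforms.mip_multichannel is not shown, but the mip_arr[1] indexing in
# the commented-out visualization suggests it returns a channel-first stack
# of projections. A hypothetical sketch that MIPs the volume in three equal
# depth slabs (an assumption about the actual implementation):
import numpy as np


def mip_multichannel_sketch(arr: np.ndarray, channels: int = 3) -> np.ndarray:
    """Split the depth axis into slabs and MIP each one separately."""
    slabs = np.array_split(arr, channels, axis=0)
    return np.stack([np.max(slab, axis=0) for slab in slabs])


volume = np.random.randint(-1024, 3000, size=(60, 128, 128))
print(mip_multichannel_sketch(volume).shape)  # (3, 128, 128)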
def axial_to_coronal_and_sagittal():
    configure_logger()
    client = cloud.authenticate()
    bucket = client.get_bucket('elvos')

    # for every axial scan
    for in_blob in bucket.list_blobs(prefix='numpy/axial'):
        # blacklist
        if in_blob.name == 'numpy/axial/LAUIHISOEZIM5ILF.npy':
            continue
        elif in_blob.name == 'numpy/axial/ALOUY4SF3BQKXQCZ.npy':
            continue
        elif in_blob.name == 'numpy/axial/ABPO2BORDNF3OVL3.npy':
            continue

        # download, then transpose, then flip it to orient it correctly
        logging.info(f'downloading {in_blob.name}')
        axial = cloud.download_array(in_blob)
        coronal = np.transpose(axial, (1, 0, 2))
        coronal = np.fliplr(coronal)
        sagittal = np.transpose(axial, (2, 0, 1))
        sagittal = np.fliplr(sagittal)

        file_id = in_blob.name.split('/')[2]
        file_id = file_id.split('.')[0]

        try:
            # save files to GCS
            coronal_io = file_io.FileIO(f'gs://elvos/numpy/coronal/'
                                        f'{file_id}.npy', 'w')
            np.save(coronal_io, coronal)
            sagittal_io = file_io.FileIO(f'gs://elvos/numpy/sagittal/'
                                         f'{file_id}.npy', 'w')
            np.save(sagittal_io, sagittal)
            coronal_io.close()
            sagittal_io.close()
        except Exception as e:
            logging.error(f'for patient ID: {file_id} {e}')
            break
        logging.info('saved .npy files to cloud')
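
# A quick sanity check of the axis permutations: with an axial stack
# indexed (z, y, x), transposing with (1, 0, 2) puts y first (coronal
# slices) and (2, 0, 1) puts x first (sagittal slices). Shapes below are
# hypothetical:
import numpy as np

axial = np.zeros((40, 256, 250))           # (z, y, x)
coronal = np.transpose(axial, (1, 0, 2))   # (y, z, x)
sagittal = np.transpose(axial, (2, 0, 1))  # (x, z, y)
print(coronal.shape, sagittal.shape)       # (256, 40, 250) (250, 40, 256)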
def transform_positives():
    """
    Transforms and upsamples all the positive chunks.

    :return:
    """
    client = cloud.authenticate()
    bucket = client.get_bucket('elvos')
    prefix = "chunk_data/filtered/positive"
    logging.info(f"transforming positive chunks from {prefix}")

    # for each blob in chunk_data/filtered/positive
    for in_blob in bucket.list_blobs(prefix=prefix):
        file_id = in_blob.name.split('/')[3]
        file_id = file_id.split('.')[0]

        # download chunk
        logging.info(f'downloading {in_blob.name}')
        input_arr = cloud.download_array(in_blob)
        logging.info(f"blob shape: {input_arr.shape}")

        # upsample chunk
        transform_one(input_arr, file_id)
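
# transform_one is defined elsewhere in the repo and not shown here. Given
# that each positive chunk later appears as variants _1 through _24 (see
# main() below), one plausible implementation enumerates the 24
# axis-aligned rotations of a cube. The sketch below is a hypothetical
# illustration of that idea, not the repo's actual transform_one:
import numpy as np


def rotations_24(chunk: np.ndarray):
    """Yield the 24 axis-aligned rotations of a cubic array."""
    def spins(a, axes):
        # four 90-degree spins in the given plane
        for k in range(4):
            yield np.rot90(a, k, axes=axes)

    # reorient each of the six faces toward axis 0, then spin four times
    yield from spins(chunk, (1, 2))
    yield from spins(np.rot90(chunk, 2, axes=(0, 2)), (1, 2))
    yield from spins(np.rot90(chunk, 1, axes=(0, 2)), (1, 2))
    yield from spins(np.rot90(chunk, -1, axes=(0, 2)), (1, 2))
    yield from spins(np.rot90(chunk, 1, axes=(0, 1)), (1, 2))
    yield from spins(np.rot90(chunk, -1, axes=(0, 1)), (1, 2))


print(len(list(rotations_24(np.random.rand(32, 32, 32)))))  # 24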
train_chunks = []
train_labels = []
val_chunks = []
val_labels = []
test_chunks = []
test_labels = []

# get positive train chunks and labels
i = 1
for id_, label in list(positive_train_label_data.items()):
    if i % 500 == 0:
        logging.info(f'got chunk {i}')
    i += 1
    blob = bucket.get_blob('chunk_data/normal/positive/' + id_ + '.npy')
    arr = cloud_management.download_array(blob)
    if arr.shape == (32, 32, 32):
        arr = np.expand_dims(arr, axis=-1)
        train_chunks.append(arr)
        train_labels.append(label)
logging.info(f'{i} total positive training chunks')

# get negative train chunks and labels
i = 1
for id_, label in list(negative_train_label_data.items()):
    if i % 500 == 0:
        logging.info(f'got chunk {i}')
    i += 1
    blob = bucket.get_blob('chunk_data/normal/negative/' + id_ + '.npy')
    arr = cloud_management.download_array(blob)
    if arr.shape == (32, 32, 32):
        arr = np.expand_dims(arr, axis=-1)
        train_chunks.append(arr)
        train_labels.append(label)
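
# Each well-formed (32, 32, 32) chunk gets a trailing channel axis so it
# can feed a single-channel 3D convolutional network:
import numpy as np

arr = np.zeros((32, 32, 32))
arr = np.expand_dims(arr, axis=-1)
print(arr.shape)  # (32, 32, 32, 1): depth, height, width, channels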
def main():
    configure_logger()

    # Access Google Cloud Storage
    gcs_client = storage.Client.from_service_account_json(
        '/home/harold_triedman/elvo-analysis/credentials/client_secret.json'
        # 'credentials/client_secret.json'
    )
    bucket = gcs_client.get_bucket('elvos')

    # Get label data from Google Cloud Storage
    blob = storage.Blob('augmented_annotated_labels.csv', bucket)
    blob.download_to_filename('tmp/augmented_annotated_labels.csv')
    prelim_label_data = {}

    # load labels from augmented_annotated_labels.csv
    with open('tmp/augmented_annotated_labels.csv', 'r') as pos_file:
        reader = csv.reader(pos_file, delimiter=',')
        for row in reader:
            if row[1] != 'Unnamed: 0.1':
                prelim_label_data[row[1]] = int(row[2])
                # prelim_label_data[row[2]] = int(row[3])

    # Get all of the positives from the label data
    positive_label_data = {}
    logging.info('getting 12168 positive labels')
    for id_, label in list(prelim_label_data.items()):
        if label == 1 and '_' in id_:
            positive_label_data[id_] = label

    positive_train_label_data = {}
    positive_val_label_data = {}
    train = {}
    val = {}

    # Loop through positives, one per set of 24 augmented chunks
    for i, id_ in enumerate(list(positive_label_data.keys())):
        if i % 24 == 0:
            # Split into train/val sets based off of random flips
            seed = random.randint(1, 100)
            stripped_id = id_[:-1]
            meta_id = id_[:16]

            # add ID to positive train data and train metadata
            if seed > 10:
                positive_train_label_data[id_] = 1
                train[meta_id] = ''
                for j in range(2, 25):
                    positive_train_label_data[stripped_id + str(j)] = 1

            # add ID to positive val data and val metadata
            else:
                positive_val_label_data[id_] = 1
                val[meta_id] = ''
                for j in range(2, 25):
                    positive_val_label_data[stripped_id + str(j)] = 1

    # Get 14500 random negatives from the label data to feed into our
    # generator
    negative_counter = 0
    negative_train_label_data = {}
    negative_val_label_data = {}
    logging.info("getting 14500 random negative labels")
    while negative_counter < 14500:
        # Get a random chunk
        id_, label = random.choice(list(prelim_label_data.items()))

        # if it's a negative
        if label == 0:
            if negative_counter % 500 == 0:
                logging.info(f'got {negative_counter} labels so far')
            meta_id = id_[:16]

            # if another chunk in this brain is in the train metadata dict
            if meta_id in train:
                negative_train_label_data[id_] = label
            # else if another chunk in this brain is in the val metadata
            # dict
            elif meta_id in val:
                negative_val_label_data[id_] = label
            # otherwise flip a coin to see where it's going to end up
            else:
                seed = random.randint(1, 100)
                if seed > 10:
                    negative_train_label_data[id_] = label
                    train[meta_id] = ''
                else:
                    negative_val_label_data[id_] = label
                    val[meta_id] = ''

            # delete it from prelim_label_data to ensure no re-picks
            del prelim_label_data[id_]
            negative_counter += 1

    # save train/val metadata
    train_df = pd.DataFrame.from_dict(train, orient='index')
    val_df = pd.DataFrame.from_dict(val, orient='index')
    train_df.to_csv('train_ids.csv')
    val_df.to_csv('val_ids.csv')

    train_chunks = []
    train_labels = []
    val_chunks = []
    val_labels = []

    # Get positive train chunks
    i = 1
    for id_, label in list(positive_train_label_data.items()):
        if i % 500 == 0:
            logging.info(f'got chunk {i}')
        i += 1
        blob = bucket.get_blob('chunk_data/normal/positive/' + id_ + '.npy')
        arr = cloud_management.download_array(blob)
        if arr.shape == (32, 32, 32):
            arr = np.expand_dims(arr, axis=-1)
            train_chunks.append(arr)
            train_labels.append(label)
    logging.info(f'{i} total positive training chunks')

    # Get positive val chunks
    i = 1
    for id_, label in list(positive_val_label_data.items()):
        if i % 500 == 0:
            logging.info(f'got chunk {i}')
        i += 1
        blob = bucket.get_blob('chunk_data/normal/positive/' + id_ + '.npy')
        arr = cloud_management.download_array(blob)
        if arr.shape == (32, 32, 32):
            arr = np.expand_dims(arr, axis=-1)
            val_chunks.append(arr)
            val_labels.append(label)
    logging.info(f'{i} total positive validation chunks')

    # Get negative train chunks
    i = 1
    for id_, label in list(negative_train_label_data.items()):
        if i % 500 == 0:
            logging.info(f'got chunk {i}')
        i += 1
        blob = bucket.get_blob('chunk_data/normal/negative/' + id_ + '.npy')
        arr = cloud_management.download_array(blob)
        if arr.shape == (32, 32, 32):
            arr = np.expand_dims(arr, axis=-1)
            train_chunks.append(arr)
            train_labels.append(label)
    logging.info(f'{i} total negative training chunks')

    # Get negative val chunks
    i = 1
    for id_, label in list(negative_val_label_data.items()):
        if i % 500 == 0:
            logging.info(f'got chunk {i}')
        i += 1
        blob = bucket.get_blob('chunk_data/normal/negative/' + id_ + '.npy')
        arr = cloud_management.download_array(blob)
        if arr.shape == (32, 32, 32):
            arr = np.expand_dims(arr, axis=-1)
            val_chunks.append(arr)
            val_labels.append(label)
    logging.info(f'{i} total negative validation chunks')

    # shuffle order of training data
    tmp = list(zip(train_chunks, train_labels))
    random.shuffle(tmp)
    train_chunks, train_labels = zip(*tmp)

    # shuffle order of validation data
    tmp = list(zip(val_chunks, val_labels))
    random.shuffle(tmp)
    val_chunks, val_labels = zip(*tmp)

    # Turn into numpy arrays
    logging.info('splitting based on validation split')
    full_x_train = np.asarray(train_chunks)
    full_y_train = np.asarray(train_labels)
    x_val = np.asarray(val_chunks)
    y_val = np.asarray(val_labels)

    logging.info(f'{len(train_chunks)} total chunks to train with')
    logging.info(f'full training data: {full_x_train.shape},'
                 f' {full_y_train.shape}')
    logging.info(f'full validation data: {x_val.shape}, {y_val.shape}')

    # mixed shapes make this an object array holding all four splits
    full_arr = np.array([full_x_train, full_y_train, x_val, y_val])

    # Save to pickle to maintain ordering
    with open('chunk_data_separated_ids.pkl', 'wb') as outfile:
        pickle.dump(full_arr, outfile, pickle.HIGHEST_PROTOCOL)
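
# The four arrays are pickled together in a fixed order, so downstream
# consumers presumably unpack them the same way. A minimal loading sketch:
import pickle

with open('chunk_data_separated_ids.pkl', 'rb') as infile:
    full_x_train, full_y_train, x_val, y_val = pickle.load(infile)

print(full_x_train.shape, x_val.shape)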
def create_chunks(annotations_df: pd.DataFrame):
    """
    Process and save actual chunks based off of the previously-derived
    annotations.

    :param annotations_df: annotations with where the actual occlusion is
    :return:
    """
    client = cloud.authenticate()
    bucket = client.get_bucket('elvos')

    # loop through every array on GCS
    for in_blob in bucket.list_blobs(prefix='airflow/npy'):
        # blacklist
        if in_blob.name == 'airflow/npy/LAUIHISOEZIM5ILF.npy':
            continue

        # get the file id
        file_id = in_blob.name.split('/')[2]
        file_id = file_id.split('.')[0]
        logging.info(f'chunking {file_id}')

        # copy ROI if there's a positive match in the ROI annotations
        roi_df = annotations_df[annotations_df['patient_id'].str.match(
            file_id)]

        # if it's empty, this brain is ELVO negative
        elvo_positive = not roi_df.empty

        arr = cloud.download_array(in_blob)
        rois = []
        centers = []

        # if it's elvo positive
        if elvo_positive:
            # iterate through every occlusion this patient has
            for row in roi_df.itertuples():
                # row[0] = index,  row[1] = patient ID,
                # row[2] = red1,   row[3] = red2,
                # row[4] = green1, row[5] = green2,
                # row[6] = blue1,  row[7] = blue2

                # append the lowest-valued corner of the ROI to rois
                rois.append((int(len(arr) - row[7]),
                             int(row[4]),
                             int(row[2])))
                # append the center of the ROI to centers
                centers.append(
                    (int(((len(arr) - row[6]) + (len(arr) - row[7])) / 2),
                     int((row[4] + row[5]) / 2),
                     int((row[2] + row[3]) / 2)))
            logging.info(f'rois: {rois}, centers: {centers}')

        h = 0
        # loop through every chunk
        for i in range(0, len(arr), 32):
            for j in range(0, len(arr[0]), 32):
                for k in range(0, len(arr[0][0]), 32):
                    found_positive = False

                    # loop through the available ROIs and centers
                    for roi, center in zip(rois, centers):
                        # if the center lies within this chunk
                        if i <= center[0] <= i + 32 \
                                and j <= center[1] <= j + 32 \
                                and k <= center[2] <= k + 32:
                            # save the ROI and skip this block
                            chunk = arr[roi[0]:roi[0] + 32,
                                        roi[1]:roi[1] + 32,
                                        roi[2]:roi[2] + 32]
                            cloud.save_chunks_to_cloud(np.asarray(chunk),
                                                       'normal', 'positive',
                                                       file_id + str(h))
                            h += 1
                            found_positive = True

                    if found_positive:
                        continue

                    # copy the chunk
                    chunk = arr[i:(i + 32), j:(j + 32), k:(k + 32)]

                    # calculate the airspace
                    airspace = np.where(chunk < -300)

                    # if it's less than 90% airspace, save the chunk to the
                    # cloud as a negative
                    if (airspace[0].size / chunk.size) < 0.9:
                        cloud.save_chunks_to_cloud(np.asarray(chunk),
                                                   'normal', 'negative',
                                                   file_id + str(h))
                        h += 1
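
# The -300 cutoff is a Hounsfield-unit threshold: voxels below roughly
# -300 HU are air or background, so chunks that are at least 90% airspace
# get discarded. A minimal sketch of the filter on a synthetic chunk:
import numpy as np

chunk = np.full((32, 32, 32), -1000.0)  # pure air (~ -1000 HU)
chunk[:4] = 40.0                        # a sliver of soft tissue

airspace = np.where(chunk < -300)
print(airspace[0].size / chunk.size)    # 0.875 < 0.9, so the chunk is kept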
def create_labels(annotations_df: pd.DataFrame):
    """
    Process and save labels for the chunks based off of previously-derived
    annotations. Very similar to create_chunks in methodology.

    :param annotations_df: annotations to get labels from
    :return:
    """
    client = cloud.authenticate()
    bucket = client.get_bucket('elvos')
    label_dict = {}

    # loop through every array on GCS
    for in_blob in bucket.list_blobs(prefix='airflow/npy'):
        # blacklist
        if in_blob.name == 'airflow/npy/LAUIHISOEZIM5ILF.npy':
            continue

        # get the file id
        file_id = in_blob.name.split('/')[2]
        file_id = file_id.split('.')[0]
        logging.info(f'labeling {file_id}')

        # copy ROI if there's a positive match in the ROI annotations
        roi_df = annotations_df[annotations_df['patient_id'].str.match(
            file_id)]

        # if it's empty, this brain is ELVO negative
        elvo_positive = not roi_df.empty

        arr = cloud.download_array(in_blob)
        rois = []
        centers = []

        # if it's elvo positive
        if elvo_positive:
            # go through each occlusion this patient has
            for row in roi_df.itertuples():
                # row[0] = index,  row[1] = patient ID,
                # row[2] = red1,   row[3] = red2,
                # row[4] = green1, row[5] = green2,
                # row[6] = blue1,  row[7] = blue2

                # append ROI to rois
                rois.append((int(len(arr) - row[7]),
                             int(row[4]),
                             int(row[2])))
                # append center to centers
                centers.append(
                    (int(((len(arr) - row[6]) + (len(arr) - row[7])) / 2),
                     int((row[4] + row[5]) / 2),
                     int((row[2] + row[3]) / 2)))
        # else it's elvo negative: rois and centers stay empty

        h = 0
        # loop through every chunk
        for i in range(0, len(arr), 32):
            for j in range(0, len(arr[0]), 32):
                for k in range(0, len(arr[0][0]), 32):
                    found_positive = False

                    # loop through the available ROIs and centers
                    for roi, center in zip(rois, centers):
                        # if the center lies within this chunk
                        if i <= center[0] <= i + 32 \
                                and j <= center[1] <= j + 32 \
                                and k <= center[2] <= k + 32:
                            # label the chunk as positive and skip this
                            # block
                            label_dict[file_id + str(h)] = 1
                            h += 1
                            found_positive = True

                    if found_positive:
                        continue

                    # copy the chunk
                    chunk = arr[i:(i + 32), j:(j + 32), k:(k + 32)]

                    # calculate the airspace
                    airspace = np.where(chunk < -300)

                    # if it's less than 90% airspace, label the chunk as
                    # negative
                    if (airspace[0].size / chunk.size) < 0.9:
                        label_dict[file_id + str(h)] = 0
                        h += 1

    # convert the labels to a df and save as a CSV
    labels_df = pd.DataFrame.from_dict(label_dict,
                                       orient='index',
                                       columns=['label'])
    labels_df.to_csv('annotated_labels.csv')
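
# For reference, the resulting CSV has chunk IDs as its index column and a
# single 'label' column; it can be read back with pandas:
import pandas as pd

labels_df = pd.read_csv('annotated_labels.csv', index_col=0)
print(labels_df['label'].value_counts())  # positive vs. negative counts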