import matplotlib.pyplot as plt
import seaborn as sns

# Collect, for the first annotation of every annotation cluster of patients
# LIDC-IDRI-0001 .. LIDC-IDRI-0199, a short identifier and four features.
# NOTE(review): the source of this snippet had lost its indentation; the
# nesting below (progress dot printed once per patient with nodules) is a
# reconstruction - confirm against the original file.
# NOTE(review): `id` shadows the builtin; the name is kept because later
# module-level code may refer to it.
id = []
diameter = []
surface_area = []
volume = []
Sphericity = []

for i in range(1, 200):
    # Zero-pad the patient number to four digits (was a manual if/elif chain).
    s = '{:04d}'.format(i)
    pid = 'LIDC-IDRI-{}'.format(s)
    scan = pl.query(pl.Scan).filter(pl.Scan.patient_id == pid).first()
    if scan is None:
        # .first() yields None when no scan matches this patient id; skip it
        # instead of failing on `None.cluster_annotations()`.
        continue
    nodules = scan.cluster_annotations()
    if not nodules:
        continue
    for idx, nodule in enumerate(nodules):
        # Only the first annotation of each cluster is sampled.
        first_annotation = nodule[0]
        id.append(pid[-4:] + '_' + str(idx))
        diameter.append(first_annotation.diameter)
        surface_area.append(first_annotation.surface_area)
        volume.append(first_annotation.volume)
        Sphericity.append(first_annotation.Sphericity)
    print(".", end='')
print()

data_dic = {}
# data_dic['id'] = id
data_dic['diameter'] = diameter
import pylidc as pl

# Report every subject (LIDC-IDRI-0001 .. LIDC-IDRI-1012) that has more than
# one scan, followed by the annotation count of each of its scans.
# NOTE(review): nesting reconstructed from a collapsed source line.
for subjectID in range(1, 1013):
    s = 'LIDC-IDRI-%04i' % subjectID
    scans = pl.query(pl.Scan).filter(pl.Scan.patient_id == s)
    scan_total = scans.count()
    if scan_total <= 1:
        continue
    print("%s has %d scans" % (s, scan_total))
    for scan_number, scan in enumerate(scans, start=1):
        annotation_total = len(scan.annotations)
        print(" Scan %d has %d annotations" % (scan_number, annotation_total))
# errors in LIDC-IDRI-0052, LIDC-IDRI-0065, LIDC-IDRI-0068 # if idx<2:continue # if idx == 5: break # if idx<=15:continue # if idx>20:continue name_original = k k = k.split('_block')[0] df_patient = df.loc[df['patientid'] == int(k[-4:])] pid = k # query the LIDC images with patient_id = pid # HERE WE JUST USE THE FIRST ONE!! idx_scan = 0 # get the scan object for this scan scan = pl.query(pl.Scan).filter(pl.Scan.patient_id == pid)[idx_scan] # here we can reject according to any criteria we like thickSlice = (scan.slice_thickness > 3) | (scan.slice_spacing > 3) missingSlices = len(np.unique(np.round( 100 * np.diff(scan.slice_zvals)))) != 1 if (thickSlice): # we want to reject this scan/patient print('Undesirable slice characteristics, rejecting') listOfRejectedPatients.append(pid) #continue raise ValueError('Undesirable slice characteristics, rejecting') elif (missingSlices): print('Missing slices, rejecting') listOfRejectedPatients.append(pid) #continue
with open('error.csv', 'w') as f: writer = csv.writer(f) writer.writerow(['short_name', 'cluster']) with open('nodule_size.csv', 'w') as f: writer = csv.writer(f) writer.writerow(['short_name', 'cluster', 'size']) with open('scan_size.csv', 'w') as f: writer = csv.writer(f) writer.writerow(['short_name', 'size']) # Generate mask from the four annotations for each nodule. # Only nodules with four annotations available are included. # Only voxels marked by three or four radiologists are regarded as nodule voxels. all_scans = pl.query(pl.Scan) config['patch_size'] = 64 savepath = config['savepath'] start = time.clock() num_scans = all_scans.count() #num = 2 all_anns = pl.query(pl.Annotation).join(pl.Scan) for scan_id in range(num_scans): anns = all_anns.filter(pl.Annotation.scan_id == scan_id) scan = all_scans[scan_id] shortname_scan = '0' * (4 - len(str(scan_id))) + str(scan_id) try: path = scan.get_path_to_dicom_files() except AssertionError: print(shortname_scan, 'does not exist')
def get_texture(self, texture_val):
    """Return all annotations whose ``texture`` rating is <= ``texture_val``.

    The query is executed immediately; the result of ``.all()`` is returned.
    """
    texture_at_most = pl.Annotation.texture <= texture_val
    matching_annotations = pl.query(pl.Annotation).filter(texture_at_most)
    return matching_annotations.all()
def get_contour(self, annotation):
    """Return all contours whose ``annotation_id`` equals ``annotation.id``."""
    belongs_to_annotation = pl.Contour.annotation_id == annotation.id
    contour_query = pl.query(pl.Contour).filter(belongs_to_annotation)
    return contour_query.all()
def get_texture_1():
    """Return all annotations whose ``texture`` rating equals 1."""
    return (
        pl.query(pl.Annotation)
        .filter(pl.Annotation.texture == 1)
        .all()
    )
TEST_MODE = False print(f"test mode: {TEST_MODE}") file = open(LOG_FILE, "w+") # The LIDC database contains annotations of up to 4 radiologist per nodule. # We need to combine these annotations. Luckily, the pylidc module provides a way to cluster annotations from overlapping nodules # It turns out that 'nodule_id' does not refer to a nodule at all, they do not overlap. # Luckily, pylidc has functionality built in to determine which nodules belong together # # Extract annotations to dataframe (note: using pd.read_sql_table might be better but I couldn't figure out which connection to use) # ## Load scans with pylidc # Create dataframe with scan information scans = pl.query(pl.Scan).all() scan_dict = {} for scan in scans: patient_id = scan.patient_id[-4:] if patient_id in scan_dict.keys(): print(f"patient with multiple scans: {patient_id}; ", end="") patient_id = str(format(int(patient_id) + int(2000))) print(f"new id: {patient_id}") scan_dict[patient_id] = scan assert len(scan_dict.keys()) == 1018 if not (RESOURCES_DIR / "scan_df.csv").exists(): scan_df_dict = {} print("preparing scan dataframe") for patient_id, scan in tqdm(scan_dict.items()): # TODO add scan-id here scan_df_dict[patient_id] = {
def check_nodule_intersections(patch_size=144, res='Legacy'):
    """Print and plot size / proximity statistics of annotation clusters.

    For every scan that has annotations, the annotations are clustered with
    ``scan.cluster_annotations(metric='jaccard', tol=0.95)``. For each cluster
    the union bounding box of its annotations is computed and scaled by
    ``scan.pixel_spacing``; sizes are accumulated, and for every ordered pair
    of clusters a distance between the center of one and the box corners of
    the other is computed. Pairs closer than 32 are reported as merge
    candidates. Finally summary lines are printed and three matplotlib
    subplots are shown.

    ``patch_size`` and ``res`` are accepted but not read anywhere in the body.
    Nothing is returned.

    NOTE(review): the source of this function had lost its indentation; the
    nesting below is a reconstruction - confirm against the original file.
    NOTE(review): ``max_size`` / ``min_size`` start as plain ints and are
    indexed (``max_size[0]``) in the summary, so the summary presumably fails
    when no cluster was processed - verify.
    """
    # When True, clusters are rebuilt from cliques of the thresholded distance
    # matrix; hard-coded off here.
    recluster_using_cliques = False
    pat_with_nod = 0
    pat_without_nod = 0
    nodule_count = 0  # only incremented in the re-clustering branch below
    max_size = 0
    min_size = 999999
    min_dist = 999999
    outliers = []  # (distance, largest merged extent) per candidate pair
    size_list = []
    global_size_list = []
    pause = 0  # number of scans where clique count != component count
    for scan in pl.query(pl.Scan).all()[:]:
        if len(scan.annotations) == 0:
            continue
        # cluster by intersection
        tol = 0.95
        nods, D = scan.cluster_annotations(metric='jaccard',
                                           tol=tol,
                                           return_distance_matrix=True)
        if len(nods) == 0:
            pat_without_nod += 1
            continue
        pat_with_nod += 1
        if recluster_using_cliques:
            adjacency = D <= tol
            if adjacency.shape[0] > 1:
                clusters = cluster_by_cliques(adjacency, None)
                print(
                    "Study ({}), Series({}) of patient {}: {} connected components. {} cliques"
                    .format(scan.study_instance_uid, scan.series_instance_uid,
                            scan.patient_id, len(nods), len(clusters)))
                nodule_count += len(nods)
                if len(nods) != len(
                        clusters):  #[[n.id for n in anns] for anns in nods]
                    pause = pause + 1
                    mds(scan=scan, clusters=clusters, distance_matrix=D)
                # re-cluster nodules by cliques
                nods = [[scan.annotations[i] for i in ids] for ids in clusters]
            else:
                clusters = [[0]]
        else:
            # Convert annotation ids into positions within scan.annotations.
            # presumably relies on the ids of one scan being contiguous and in
            # list order - TODO confirm
            id_0 = np.min([ann.id for ann in scan.annotations])
            clusters = [[ann.id - id_0 for ann in cluster] for cluster in nods]
        centers = []
        boxes = []
        for cluster in clusters:
            nod = [scan.annotations[ann_id] for ann_id in cluster]
            print("Nodule of patient {} with {} annotations.".format(
                scan.patient_id, len(nod)))
            # Element-wise min / max over the annotations' bbox columns gives
            # the union bounding box of the cluster.
            min_ = reduce((lambda x, y: np.minimum(x, y)),
                          [ann.bbox()[:, 0] for ann in nod])
            max_ = reduce((lambda x, y: np.maximum(x, y)),
                          [ann.bbox()[:, 1] for ann in nod])
            size = scan.pixel_spacing * (max_ - min_ + 1)
            size_list.append(size)
            if np.max(size) >= 64:
                print("\tNodule Size = {:.1f} x {:.1f} x {:.1f}".format(
                    size[0], size[1], size[2]))
                if size[2] == 1:
                    print("\t\tNodule BB = {}".format(
                        [ann.bbox()[:, 0] for ann in nod]))
            max_size = np.maximum(max_size, size)
            min_size = np.minimum(min_size, size)
            centers.append(scan.pixel_spacing * min_ + size // 2)
            # Row 0: scaled lower corner, row 1: scaled upper corner.
            boxes.append(
                np.vstack(
                    [scan.pixel_spacing * min_, scan.pixel_spacing * max_]))
        cluster_candidates = []  # never read
        for i, nod_i in enumerate(nods):
            j_outs = []  # indices of clusters close enough to merge with i
            for j, nod_j in enumerate(nods):
                if i == j:
                    continue
                #if centers[i][2] < boxes[j][0][2]: # ignore if cross-section of i doesn't contain j
                #    continue
                #if centers[i][2] > boxes[j][1][2]: # ignore if cross-section of i doesn't contain j
                #    continue
                # Per axis: distance from center i to the nearer corner of
                # box j; then the largest of those per-axis distances.
                dist = np.abs(centers[i] - boxes[j])
                dist = np.min(dist, axis=0)
                dist = np.max(dist)
                min_dist = np.minimum(min_dist, dist)
                if dist > 32:
                    continue
                if dist > 10:
                    stop = 1  # never read; looks like a debugger anchor
                print("\tDist = {}".format(dist))
                min_ = np.minimum(boxes[i][0], boxes[j][0])
                max_ = np.maximum(boxes[i][1], boxes[j][1])
                size = (max_ - min_ + 1)
                print("\t\tMerged ({}, {}) Size = {:.1f} x {:.1f} x {:.1f}".
                      format(i, j, size[0], size[1], size[2]))
                j_outs.append(j)
                outliers.append((dist, np.max(size)))
            if len(j_outs) > 1:
                # Rebinds `boxes` to an ndarray so it can be indexed with a
                # list of indices; later iterations see the array.
                boxes = np.array(boxes)
                min_ = reduce((lambda x, y: np.minimum(x, y)),
                              [bb[0, :] for bb in boxes[j_outs + [i]]])
                max_ = reduce((lambda x, y: np.maximum(x, y)),
                              [bb[1, :] for bb in boxes[j_outs + [i]]])
                size = (max_ - min_ + 1)
                if np.any(size > 60):
                    stop = 1  # never read; looks like a debugger anchor
                print(
                    "\t\t Global Merged ({}, {}) Size = {:.1f} x {:.1f} x {:.1f}"
                    .format(i, j_outs, size[0], size[1], size[2]))
                global_size_list.append(np.max(size))
    print("=" * 30)
    print("Prepared {} entries".format(nodule_count))
    print("{} patients with nodules, {} patients without nodules".format(
        pat_with_nod, pat_without_nod))
    print("\tMax Size = {:.1f} x {:.1f} x {:.1f}".format(
        max_size[0], max_size[1], max_size[2]))
    print("\tMin Size = {:.1f} x {:.1f} x {:.1f}".format(
        min_size[0], min_size[1], min_size[2]))
    print("\tMin Dist = {}".format(min_dist))
    print("== Number of cluster breaks = {} ==".format(pause))
    x_dist = [o[0] for o in outliers]
    y_size = [o[1] for o in outliers]
    plt.figure()
    # Histogram of the largest extent of every cluster.
    plt.subplot(311)
    plt.title('Nodule (cluster) Size')
    plt.xlabel('size')
    plt.ylabel('hist')
    plt.hist(np.max(size_list, axis=1), 50)
    # Distance vs. merged size for every candidate pair.
    plt.subplot(312)
    plt.title('Pairwise-Merges')
    plt.xlabel('dist')
    plt.ylabel('merged size')
    plt.scatter(
        np.array(x_dist).astype('uint'),
        np.array(y_size).astype('uint'))
    # Histogram of the largest extent of every multi-cluster merge.
    plt.subplot(313)
    plt.title('Total Size')
    plt.xlabel('size')
    plt.ylabel('hist')
    plt.hist(global_size_list, 50)
    plt.show()
def get_scan():
    """Return the scans that have at least one annotation with texture 1.

    The previous version filtered ``pl.Scan`` on ``pl.Annotation.texture``
    without joining the two tables. That produces an implicit cross join, so
    the annotation filter did not restrict which scans were returned. The
    explicit join ties each annotation to its own scan, and ``distinct()``
    collapses scans that match through several annotations.
    """
    scans_with_texture_1 = (
        pl.query(pl.Scan)
        .join(pl.Annotation)
        .filter(pl.Annotation.texture == 1)
        .distinct()
    )
    return scans_with_texture_1.all()
def extract(patch_size=144, res='Legacy', dump=True):
    """Build one 2-D patch/mask entry per annotation cluster of every scan.

    For each scan the annotations are clustered; for each cluster the
    annotation with the largest slice (per ``getLargestSliceInBB``) is chosen
    and a ``patch_size`` patch plus mask is cut around it.

    Args:
        patch_size: Side length passed to the cropping / resampling helpers.
        res: ``'Legacy'`` crops the original DICOM slice; any other value is
            passed as ``resolution`` to ``uniform_cubic_resample``.
        dump: When truthy, the collected entries are pickled to
            ``NodulePatches{patch_size}-{res}.p`` in the working directory.

    Returns:
        None. Results are only written to disk (when ``dump``) and summarized
        on stdout.

    NOTE(review): the source of this function had lost its indentation; the
    nesting below is a reconstruction - confirm against the original file.
    """
    filename = 'NodulePatches{}-{}.p'.format(patch_size, res)

    dataset = []
    pat_with_nod = 0
    pat_without_nod = 0
    patient_nodules = {}  # patient id -> number of clusters (not read later)

    if dump is False:
        print("Running without dump")

    for scan in pl.query(pl.Scan).all()[:]:  # cycle 1018 scans
        #
        # Example for debugging:
        # scan = pl.query(pl.Scan).filter(pl.Scan.patient_id == 'LIDC-IDRI-0004').first()
        #
        nods = scan.cluster_annotations(metric='jaccard',
                                        tol=0.95,
                                        tol_limit=0.7)
        if len(nods) == 0:
            pat_without_nod += 1
            continue

        pat_with_nod += 1
        print("Study ({}), Series({}) of patient {}: {} nodules.".format(
            scan.study_instance_uid, scan.series_instance_uid,
            scan.patient_id, len(nods)))
        # Was keyed by the literal string 'scan.patient_id', so every scan
        # overwrote the same single entry.
        patient_nodules[scan.patient_id] = len(nods)
        dicom = scan.load_all_dicom_images(verbose=False)

        for nod in nods:
            print("Nodule of patient {} with {} annotations.".format(
                scan.patient_id, len(nod)))
            # One helper call per annotation (was two).
            # assumes getLargestSliceInBB has no side effects - TODO confirm
            largest = [getLargestSliceInBB(ann) for ann in nod]
            largestSliceA = [item[0] for item in largest]  # largest slice within annotated bb
            largestSliceZ = [item[1] for item in largest]  # index within the mask
            annID = np.argmax(largestSliceA)  # which annotation has the largest slice
            chosen = nod[annID]
            # just for the entry data;
            # possible mismatch between retrieved z and largestSliceZ[annID]
            # due to missing dicom files
            z = interpolateZfromBBidx(chosen, largestSliceZ[annID])

            # `res is 'Legacy'` compared object identity, which is not
            # guaranteed to hold for equal strings; compare by value.
            if res == 'Legacy':
                di_slice = getSlice(dicom, z, rescale=True)
                mask = get_full_size_mask(chosen, di_slice.shape)
                patch = cropSlice(di_slice, chosen.centroid(), patch_size)
                mask = cropSlice(mask, chosen.centroid(), patch_size)
            else:
                vol0, seg0 = chosen.uniform_cubic_resample(
                    side_length=(patch_size - 1), resolution=res, verbose=0)
                # Pick the slice of the resampled volume with the most
                # segmented voxels.
                resampled_z = np.argmax(
                    np.sum(seg0.astype('float32'), axis=(0, 1)))
                patch = rescale_im_to_hu(vol0[:, :, resampled_z],
                                         dicom[0].RescaleIntercept,
                                         dicom[0].RescaleSlope)
                mask = seg0[:, :, resampled_z]

            entry = {
                'patch': patch.astype(np.int16),
                'info': (scan.patient_id, scan.study_instance_uid,
                         scan.series_instance_uid, chosen._nodule_id),
                'nod_ids': [n._nodule_id for n in nod],
                'rating': np.array([ann.feature_vals() for ann in nod]),
                'mask': mask.astype(np.int16),
                'z': z,
                'size': getNoduleSize(nod)
            }
            dataset.append(entry)

    print("Prepared {} entries".format(len(dataset)))
    print("{} patients with nodules, {} patients without nodules".format(
        pat_with_nod, pat_without_nod))

    if dump:
        # Context manager so the file is flushed and closed deterministically.
        with open(filename, 'wb') as dump_file:
            pickle.dump(dataset, dump_file)
        print("Dumpted to {}".format(filename))
    else:
        print("No Dump")
def extract_from_cluster_map(cluster_map, patch_size=144, res='Legacy'):
    """Build one patch/mask entry per DICOM slice that intersects a cluster.

    Args:
        cluster_map: Mapping indexed by ``scan.id`` that yields a pair
            ``(nods, cluster_indices)``; scans without a usable entry are
            skipped.
        patch_size: Side length of the square patch and mask produced by
            ``crop``.
        res: When a float, image and mask are resized by
            ``scan.pixel_spacing / res`` before cropping.

    Returns:
        list of dict entries with the keys 'patch', 'info', 'nod_ids',
        'rating', 'ann_size', 'weights', 'mask', 'z' and 'size'.

    NOTE(review): the source of this function had lost its indentation; the
    nesting below is a reconstruction - confirm against the original file.
    NOTE(review): the size sanity check passes ``res`` to ``calc_mask_size``
    and compares against it even when ``res`` is not a number (the default is
    'Legacy'); left unchanged - confirm the intended behavior.
    """
    dataset = []

    for scan in pl.query(pl.Scan).all()[:]:  # cycle 1018 scans
        #
        # Example for debugging:
        # scan = pl.query(pl.Scan).filter(pl.Scan.patient_id == 'LIDC-IDRI-0004').first()
        #
        try:
            nods, cluster_indices = cluster_map[scan.id]
        except (KeyError, IndexError, TypeError, ValueError):
            # No (or malformed) entry for this scan. Narrowed from a bare
            # `except:` so that e.g. KeyboardInterrupt is not swallowed.
            continue

        print("Study ({}), Series({}) of patient {}:".format(
            scan.study_instance_uid, scan.series_instance_uid,
            scan.patient_id))
        dicom = scan.load_all_dicom_images(verbose=False)
        # Slice z-positions depend only on the scan: computed once here
        # instead of once per cluster.
        img_zs = [float(img.ImagePositionPatient[-1]) for img in dicom]

        for indices in cluster_indices:
            assert len(nods) > 0
            nodules_in_cluster = np.concatenate([nods[i] for i in indices])
            print("\tCluster with {} nodules.".format(len(nodules_in_cluster)))
            z_range = get_z_range(nodules_in_cluster)
            assert (len(np.unique(img_zs)) == len(img_zs))

            slice_zs = [z for z in img_zs if z_range[0] <= z <= z_range[1]]
            for z in slice_zs:
                image = getSlice(dicom, z, rescale=True)
                full_mask = np.zeros(image.shape).astype('bool')
                weights = []
                ratings = []
                nodule_ids = []
                annotation_size = []

                for nod in nodules_in_cluster:
                    mask, bb, w = getMask(z, nod, img_zs, scan)
                    if mask is None or 0 == w:
                        # skip annotation
                        continue
                    # OR the annotation mask into the full-slice mask at its
                    # bounding-box position (bb bounds are inclusive).
                    full_mask[int(bb[0][0]):int(bb[0][1] + 1),
                              int(bb[1][0]):int(bb[1][1] + 1)] |= mask
                    nodule_ids.append(nod._nodule_id)
                    ratings.append(nod.feature_vals())
                    assert (len(np.flatnonzero(mask)) > 0)
                    annotation_size.append(
                        calc_mask_size(mask, mm_per_px=scan.pixel_spacing))
                    weights.append(w)

                if 0 == np.count_nonzero(full_mask):
                    # skips slice
                    continue

                mask_size = calc_mask_size(full_mask,
                                           mm_per_px=scan.pixel_spacing)

                # isinstance also accepts float subclasses such as
                # numpy.float64, which `type(res) is float` rejected.
                if isinstance(res, float):
                    new_shape = tuple(
                        (np.array(image.shape) *
                         (scan.pixel_spacing / res)).astype('int'))
                    image = transform.resize(image,
                                             output_shape=new_shape,
                                             order=1,
                                             preserve_range=True,
                                             mode='constant')
                    full_mask = transform.resize(full_mask,
                                                 output_shape=new_shape,
                                                 order=0,
                                                 preserve_range=True,
                                                 mode='constant')
                    if 0 == np.count_nonzero(full_mask):
                        # sometimes the mask is pixel-wide, so after resize
                        # nothing is left; would've anyhow been filtered in
                        # later stages
                        continue

                patch, mask = crop(image,
                                   full_mask,
                                   fix_size=patch_size,
                                   stdev=0)

                cropped_mask_size = calc_mask_size(mask, mm_per_px=res)
                if np.abs(mask_size - cropped_mask_size) > res:
                    print("{}, {}:\n\tfull mask size = {}\n\tmask size = {}".
                          format(scan.patient_id, z, mask_size,
                                 cropped_mask_size))

                assert (patch.shape == (patch_size, patch_size))
                assert (mask.shape == (patch_size, patch_size))

                entry = {
                    'patch': patch.astype(np.int16),
                    'info': (scan.patient_id, scan.study_instance_uid,
                             scan.series_instance_uid, nodule_ids),
                    'nod_ids': nodule_ids,
                    'rating': np.array(ratings),
                    'ann_size': np.array(annotation_size),
                    'weights': np.array(weights),
                    # `np.bool` was removed in NumPy 1.24; the builtin is the
                    # documented replacement.
                    'mask': mask.astype(bool),
                    'z': z,
                    'size': mask_size
                }
                dataset.append(entry)

    print("Prepared {} entries".format(len(dataset)))
    return dataset
import pylidc as pl
import numpy as np

# test: take the second annotation with texture 1 and inspect its scan volume
texture_1_annotations = pl.query(pl.Annotation).filter(
    pl.Annotation.texture == 1)
ann = texture_1_annotations[1]
mask = ann.boolean_mask()
vol = ann.scan.to_volume()
# print (mask[363][343])
# print (mask.shape)
probe_value = vol[363][343][0]
print(probe_value)
volume_shape = vol.shape
print(volume_shape)
import pylidc as pl import dicom import pylab from numpy import shape import os import numpy query = pl.query(pl.Contour) print('Total Contours = ' + str(query.count())) def strip_leading_zeros(file_name): length = len(file_name) split_index = 0 i = 0 for i in range(length): if file_name[i] == '0': continue else: break return file_name[i:] qann = pl.query(pl.Annotation) for ann in qann: scan = ann.scan contours = ann.contours base_path = scan.get_path_to_dicom_files(checkpath=False) z_to_file_mapping = {} for filename in os.listdir(base_path):
import numpy as np
import pylidc as pl

# For every scan, create one pl.Zval row per distinct slice z-position and
# attach it to the scan.
# NOTE(review): the source of this snippet had lost its indentation; the
# position of the commit (once per scan) is a reconstruction - confirm.
scans = pl.query(pl.Scan)
nscans = scans.count()

for i, scan in enumerate(scans):
    # Was a Python 2 print statement, a SyntaxError on Python 3. This form
    # prints the same text on both.
    print("%d / %d" % (i + 1, nscans))
    images = scan.load_all_dicom_images(verbose=0)
    img_zs = [float(img.ImagePositionPatient[-1]) for img in images]
    img_zs = np.unique(img_zs)  # distinct z-positions, sorted

    for zval in img_zs:
        z = pl.Zval()
        z.val = float(zval)
        z.scan = scan

    pl._session.commit()
import numpy as np
import pylidc

# Debug script: print the cluster sizes of one scan, then the per-axis
# bounding-box rows of two hand-picked annotation groups.
#scan = pylidc.query(pylidc.Scan).filter(pylidc.Scan.patient_id == 'LIDC-IDRI-0340').first() # should be [4,4]
scan = pylidc.query(pylidc.Scan).filter(
    pylidc.Scan.patient_id == 'LIDC-IDRI-0867').first()

#print([len(a) for a in scan.cluster_annotations()])
clusters = scan.cluster_annotations(metric='jaccard', tol=0.95, tol_limit=0.7)
print([len(a) for a in clusters])

annotation_groups = ([0, 3, 4, 7], [1, 2, 5, 6])
for group_number, annotation_indices in enumerate(annotation_groups):
    if group_number > 0:
        # separator between the two groups
        print('-' * 10)
    for axis in range(3):
        rows = [scan.annotations[i].bbox()[axis] for i in annotation_indices]
        print(np.vstack(rows))
if length == 3: return contours mid_point = int(math.floor(length / 2)) mid_contours = [] for i in range(mid_point - 1, mid_point + 2): mid_contours.append(contours[i]) return mid_contours from sqlalchemy import and_ annotations = pl.query(pl.Annotation).filter( and_(pl.Annotation.id >= 39, pl.Annotation.id <= 39)) # Fetch and process all the annotation data there is in the system annotations = pl.query(pl.Annotation) # annotations_count = annotations.count() qualified_ann_count = 0 max_xrange = 0 max_yrange = 0 min_xrange = 100000 min_yrange = 100000 training_data = [] target_data = [] for ann in annotations:
f'{out_path}{subset_series_ids[jj]}/lungs_segmented/lungs_segmented.npz', numpyImage_segmented) # go through all candidates that are in this image # sort to make sure we have all the trues (for prototyping only) curr_cands = curr_cands.sort_values( 'class', ascending=False).reset_index(drop=True) # Added in v2 one_segmentation_consensus = np.zeros_like(numpyImage) one_segmentation_maxvol = np.zeros_like(numpyImage) labelledNods = np.zeros_like(numpyImage) # query the LIDC images HERE WE JUST USE THE FIRST ONE!! idx_scan = 0 scan = pl.query(pl.Scan).filter( pl.Scan.series_instance_uid == subset_series_ids[jj])[idx_scan] nods = scan.cluster_annotations( ) # get the annotations for all nodules in this scan #print(np.shape(nods)) #Get all the nodules (class==1) curr_cands_class1 = curr_cands.loc[curr_cands['class'] == 1] for i_curr_cand in range(len(curr_cands_class1)): curr_cand = curr_cands_class1.iloc[i_curr_cand] # first need to find the corresponding column in the annotations csv (assuming its the closest # nodule to the current candidate) # extract the annotations for the scan id of our current candidate annotations_scan_df = annotations_df.loc[ annotations_df['seriesuid'] == curr_cand['seriesuid']]
import pylidc as pl import numpy as np import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D # noqa: F401 unused import from sklearn.cluster import SpectralClustering from sklearn.decomposition import PCA from scipy.spatial.distance import pdist, cdist, squareform ratings = np.array([ann.feature_vals() for ann in pl.query(pl.Annotation).all()]) projection_3d = PCA(n_components=3).fit_transform(ratings) distance_matrix = squareform(pdist(ratings, 'euclidean')) affinity_matrix = np.exp(- distance_matrix / distance_matrix.std()) ## # SELECT N OF CLUSTERS ## n_clusters = np.arange(16, 1025, 16) n_clusters_scores = list() for k in n_clusters: sc = SpectralClustering(64, affinity='precomputed', assign_labels='kmeans', n_init=100) clusters = sc.fit_predict(affinity_matrix) cluster_score = list() for label in np.unique(clusters): cluster_mask = (label == clusters) out_of_cluster_mask = np.logical_not(cluster_mask) interclass_scores = cdist(projection_3d[cluster_mask, :], projection_3d[out_of_cluster_mask, :], 'euclidean').min(axis=1).mean() inclass_scores = pdist(projection_3d[cluster_mask, :], 'euclidean').mean() if inclass_scores < 1e-3:
def get_scan(self, annotation):
    """Return the list of scans whose ``id`` equals ``annotation.scan_id``."""
    is_owning_scan = pl.Scan.id == annotation.scan_id
    owning_scans = pl.query(pl.Scan).filter(is_owning_scan)
    return owning_scans.all()
return contours mid_point = int(math.floor(length / 2)) mid_contours = [] for i in range(mid_point - 1, mid_point + 2): mid_contours.append(contours[i]) return mid_contours from sqlalchemy import and_ # annotations = pl.query(pl.Annotation).filter(and_(pl.Annotation.id >= 4640, pl.Annotation.id <= 4641)) # Fetch and process all the annotation data there is in the system annotations = pl.query(pl.Annotation) annotations_count = annotations.count() qualified_ann_count = 0 max_xrange = 0 max_yrange = 0 min_xrange = 100000 min_yrange = 100000 training_data = [] target_data = [] for ann in annotations: ann_id = str(ann.id) ann_id = ann_id.rjust(8, ' ')
def makeCompositeObjects(self, subjectID):
    """Merge per-annotation SEG/SR inputs of a subject into composite objects.

    For every scan of subject ``LIDC-IDRI-<subjectID:04>``:
      1. find all ``Nodule*Annotation*.nrrd`` segmentations in the scan's
         temp directory,
      2. merge their ``.json`` metadata and run ``itkimage2segimage`` to
         write ``all_segmentations.dcm``,
      3. merge their `` measurements.json`` metadata and run
         ``tid1500writer`` to write ``all_measurements.dcm``.

    Returns None. The method returns early (after logging an error) when the
    series directory or its DICOM files are missing or unreadable. Scans
    without segmentations, or whose composite SEG was not produced, are
    skipped.

    NOTE(review): the source of this method had lost its indentation; the
    nesting below is a reconstruction - confirm against the original file.
    """
    import re

    def _read_json(path):
        # Close the file deterministically instead of leaking the handle.
        with open(path) as json_file:
            return json.load(json_file)

    # Raw string: '\d' and '\.' in a normal literal are invalid escapes.
    seg_name_re = re.compile(r'Nodule (\d+) - Annotation (.+)\.')

    s = 'LIDC-IDRI-%04i' % subjectID
    self.logger.info("Making composite objects for " + s)

    scans = pl.query(pl.Scan).filter(pl.Scan.patient_id == s)
    self.logger.info(" Found %d scans" % (scans.count()))

    for scan in scans:
        studyUID = scan.study_instance_uid
        seriesUID = scan.series_instance_uid
        seriesDir = os.path.join(self.rootDir, s, studyUID, seriesUID)
        if not os.path.exists(seriesDir):
            self.logger.error("Files not found for subject " + s)
            return

        dcmFiles = glob.glob(os.path.join(seriesDir, "*.dcm"))
        if not dcmFiles:
            # Was the bare name `logger`; every other call in this method
            # uses self.logger.
            self.logger.error("No DICOM files found for subject " + s)
            return

        # glob already returns paths that include seriesDir; joining them
        # with seriesDir again duplicates the prefix for relative roots.
        firstFile = dcmFiles[0]
        try:
            ctDCM = pydicom.read_file(firstFile)
        except Exception:
            self.logger.error("Failed to read input file " + firstFile)
            return

        # SeriesNumber may be empty; the same guarded value is used for both
        # the SEG and the SR series (the SR one used to be unguarded).
        baseSeriesNumber = int(
            ctDCM.SeriesNumber if ctDCM.SeriesNumber else 0)

        self.instanceCount = 1000

        subjectScanTempDir = os.path.join(self.tempDir, s, studyUID,
                                          seriesUID)
        allSegmentations = glob.glob(
            os.path.join(subjectScanTempDir, 'Nodule*Annotation*.nrrd'))
        if not allSegmentations:
            continue

        segMetadata = {}
        srMetadata = {}

        for segID, seg in enumerate(allSegmentations):
            prefix = seg[:-5]  # strip the 5-character ".nrrd" suffix
            matches = seg_name_re.match(os.path.split(seg)[1])
            if matches is None:
                # The glob pattern is looser than the regex, so a file can
                # be globbed without matching; do not crash on `.group`.
                self.logger.warning("Unexpected segmentation file name " +
                                    seg)
            else:
                print("Nodule: " + matches.group(1) + " Annotation: " +
                      matches.group(2))

            # The first file's metadata is the base; later files contribute
            # their first segment / measurement entry.
            thisSegMetadata = _read_json(prefix + ".json")
            if not segMetadata:
                segMetadata = thisSegMetadata
            else:
                segMetadata["segmentAttributes"].append(
                    thisSegMetadata["segmentAttributes"][0])

            thisSRMetadata = _read_json(prefix + " measurements.json")
            if not srMetadata:
                srMetadata = thisSRMetadata
            else:
                thisSRMetadata["Measurements"][0][
                    "ReferencedSegment"] = segID + 1
                srMetadata["Measurements"].append(
                    thisSRMetadata["Measurements"][0])

        nrrdSegFileList = ",".join(allSegmentations)

        segMetadata[
            "ContentDescription"] = "Lung nodule segmentation - all"
        segMetadata["SeriesDescription"] = "Segmentations of all nodules"
        segMetadata["SeriesNumber"] = str(baseSeriesNumber +
                                          self.instanceCount)
        self.instanceCount = self.instanceCount + 1

        # run SEG converter
        allSegsJSON = os.path.join(subjectScanTempDir,
                                   "all_segmentations.json")
        with open(allSegsJSON, "w") as f:
            json.dump(segMetadata, f, indent=2)

        compositeSEGFileName = os.path.join(subjectScanTempDir,
                                            "all_segmentations.dcm")

        converterCmd = [
            'itkimage2segimage', "--inputImageList", nrrdSegFileList,
            "--inputDICOMDirectory", seriesDir, "--inputMetadata",
            allSegsJSON, "--outputDICOM", compositeSEGFileName
        ]
        if self.args.skip:
            converterCmd.append('--skip')
        self.logger.info("Converting to DICOM SEG with " +
                         str(converterCmd))

        sp = subprocess.Popen(converterCmd,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
        (stdout, stderr) = sp.communicate()
        # errors='replace': tool output is not guaranteed to be pure ASCII.
        self.logger.info("itkimage2segimage stdout: " +
                         stdout.decode('ascii', errors='replace'))
        self.logger.warning("itkimage2segimage stderr: " +
                            stderr.decode('ascii', errors='replace'))

        if not os.path.exists(compositeSEGFileName):
            self.logger.error(
                "Failed to access output composite SEG file for " + s)
            # The SR below needs the SEG's SOPInstanceUID; without the file
            # there is nothing to reference, so skip this scan.
            continue

        # populate composite SR JSON
        # need SEG SOPInstanceUID for that purpose
        segDcm = pydicom.read_file(compositeSEGFileName)
        segUID = segDcm.SOPInstanceUID

        for measurement in srMetadata["Measurements"]:
            measurement["segmentationSOPInstanceUID"] = segUID

        srMetadata["compositeContext"] = [
            os.path.split(compositeSEGFileName)[1]
        ]
        srMetadata["ContentDescription"] = "Lung nodule measurements - all"
        srMetadata["SeriesDescription"] = "Evaluations for all nodules"
        srMetadata["SeriesNumber"] = str(baseSeriesNumber +
                                         self.instanceCount)
        self.instanceCount = self.instanceCount + 1

        allSrsJSON = os.path.join(subjectScanTempDir,
                                  "all_measurements.json")
        with open(allSrsJSON, "w") as f:
            json.dump(srMetadata, f, indent=2)

        compositeSRFileName = os.path.join(subjectScanTempDir,
                                           "all_measurements.dcm")

        converterCmd = [
            'tid1500writer', "--inputMetadata", allSrsJSON,
            "--inputImageLibraryDirectory", seriesDir,
            "--inputCompositeContextDirectory", subjectScanTempDir,
            "--outputDICOM", compositeSRFileName
        ]
        self.logger.info("Converting to DICOM SR with " +
                         str(converterCmd))

        sp = subprocess.Popen(converterCmd,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
        (stdout, stderr) = sp.communicate()
        self.logger.info("tid1500writer stdout: " +
                         stdout.decode('ascii', errors='replace'))
        self.logger.warning("tid1500writer stderr: " +
                            stderr.decode('ascii', errors='replace'))

        if not os.path.exists(compositeSRFileName):
            self.logger.error(
                "Failed to access output composite SR file for " + s)
def get_scan_from_ann(ann):
    """Return the (unevaluated) query for scans whose id is ``ann.scan_id``.

    The query object itself is returned; no ``.all()`` / ``.first()`` is
    applied here.
    """
    owning_scan_id = ann.scan_id
    return pl.query(pl.Scan).filter(pl.Scan.id == owning_scan_id)
def convertForSubject(self, subjectID):
    """Convert the CT volume and every annotation of one subject.

    For every scan of subject ``LIDC-IDRI-<subjectID:04>`` the CT series is
    converted to ``<tempDir>/<subject>/<subject>_CT.nrrd`` with plastimatch
    (unless that file already exists), read back with ITK, and
    ``self.convertSingleAnnotation`` is called for every annotation: first
    the clustered ones, then any annotation that no cluster contained.

    Returns None. The method returns early (after logging an error) when the
    series directory, its DICOM files or the converted volume are missing or
    unreadable.

    NOTE(review): the source of this method had lost its indentation; the
    nesting below - in particular the per-scan position of the final
    cleanUpTempDir call - is a reconstruction; confirm against the original.
    NOTE(review): the plastimatch executable path is hard-coded to one
    developer's build directory.
    """
    s = 'LIDC-IDRI-%04i' % subjectID
    self.logger.info("Processing subject %s" % (s))

    scans = pl.query(pl.Scan).filter(pl.Scan.patient_id == s)
    self.logger.info(" Found %d scans" % (scans.count()))

    for scan in scans:
        studyUID = scan.study_instance_uid
        seriesUID = scan.series_instance_uid
        seriesDir = os.path.join(self.rootDir, s, studyUID, seriesUID)
        if not os.path.exists(seriesDir):
            self.logger.error("Files not found for subject " + s)
            return

        dcmFiles = glob.glob(os.path.join(seriesDir, "*.dcm"))
        if not dcmFiles:
            # Was the bare name `logger`; every other call in this method
            # uses self.logger.
            self.logger.error("No DICOM files found for subject " + s)
            return

        # glob already returns paths that include seriesDir; joining them
        # with seriesDir again duplicates the prefix for relative roots.
        firstFile = dcmFiles[0]
        try:
            ctDCM = pydicom.read_file(firstFile)
        except Exception:
            self.logger.error("Failed to read input file " + firstFile)
            return

        ok = lidc_helpers.checkSeriesGeometry(seriesDir)
        if not ok:
            self.logger.warning("Geometry inconsistent for subject %s" %
                                (s))

        self.tempSubjectDir = os.path.join(self.tempDir, s)
        reconTempDir = os.path.join(self.tempSubjectDir, "dicom2nrrd")
        # exist_ok replaces `try/except: pass`, which also hid real
        # failures such as permission errors.
        os.makedirs(reconTempDir, exist_ok=True)

        scanNRRDFile = os.path.join(self.tempSubjectDir, s + '_CT.nrrd')
        if not os.path.exists(scanNRRDFile):
            # convert
            plastimatchCmd = [
                '/Users/fedorov/build/plastimatch/plastimatch', 'convert',
                '--input', seriesDir, '--output-img', scanNRRDFile
            ]
            self.logger.info("Running plastimatch with " +
                             str(plastimatchCmd))

            sp = subprocess.Popen(plastimatchCmd,
                                  stderr=subprocess.PIPE,
                                  stdout=subprocess.PIPE)
            (stdout, stderr) = sp.communicate()
            # errors='replace': tool output is not guaranteed to be ASCII.
            self.logger.info("plastimatch stdout: " +
                             stdout.decode('ascii', errors='replace'))
            self.logger.warning("plastimatch stderr: " +
                                stderr.decode('ascii', errors='replace'))
            self.logger.info('plastimatch completed')

            if not os.path.exists(scanNRRDFile):
                # "OK" used to be logged unconditionally and the reader
                # below then failed on the missing file.
                self.logger.error("Conversion of CT volume failed for " +
                                  s)
                return
            self.logger.info("Conversion of CT volume OK - result in " +
                             scanNRRDFile)
        else:
            self.logger.info(
                scanNRRDFile +
                " exists. Not rerunning volume reconstruction.")

        reader = itk.ImageFileReader[itk.Image[itk.SS, 3]].New()
        reader.SetFileName(scanNRRDFile)
        reader.Update()
        volume = reader.GetOutput()

        # now iterate over all nodules available for this subject
        anns = scan.annotations
        self.logger.info("Have %d annotations for subject %s" %
                         (len(anns), s))

        self.instanceCount = 0

        clusteredAnnotationIDs = set()
        # Start at -1 so the counters are defined (and the first leftover
        # annotation gets index 0) even when clustering yields nothing.
        nCount = -1
        aCount = -1

        for nCount, nodule in enumerate(scan.cluster_annotations()):
            # by default, pydicom uses 2.25 root
            noduleUID = pydicom.uid.generate_uid(prefix=None)
            for aCount, a in enumerate(nodule):
                clusteredAnnotationIDs.add(a.id)
                self.convertSingleAnnotation(nCount, aCount, a, ctDCM,
                                             noduleUID, volume, seriesDir)

        if len(clusteredAnnotationIDs) != len(anns):
            self.logger.warning(
                "%d annotations unaccounted for!" %
                (len(anns) - len(clusteredAnnotationIDs)))

        # Annotations that no cluster contained are converted as nodules of
        # their own, continuing the counters from the clustered pass.
        for ua in anns:
            if ua.id not in clusteredAnnotationIDs:
                aCount = aCount + 1
                nCount = nCount + 1
                noduleUID = pydicom.uid.generate_uid(prefix=None)
                self.convertSingleAnnotation(nCount, aCount, ua, ctDCM,
                                             noduleUID, volume, seriesDir)

        self.cleanUpTempDir(self.tempSubjectDir)
def get_ann_from_scan():
    """Print the annotation clusters of every scan; return the last ones.

    Returns:
        The result of ``cluster_annotations()`` for the last scan visited,
        or an empty list when there are no scans (previously an
        UnboundLocalError).

    NOTE(review): the source of this function had lost its indentation; this
    reconstruction assumes the print is inside the loop and the return after
    it - confirm against the original file.
    """
    nodules = []
    # One pass over a single query. The old loop loaded every scan just to
    # count them and then issued one more query per index.
    for scan in pl.query(pl.Scan):
        nodules = scan.cluster_annotations()
        print(nodules)
    return nodules
import pylidc as pl

# Demo: print the patient id of the first annotation, build a query for
# highly spiculated and malignant annotations, then show features of the
# first annotation rated malignancy 4.
all_annotations = pl.query(pl.Annotation)
ann = all_annotations.first()
print(ann.scan.patient_id)

anns = pl.query(pl.Annotation).filter(
    pl.Annotation.spiculation == 5,
    pl.Annotation.malignancy == 5,
)
# print(anns)

ann = (
    pl.query(pl.Annotation)
    .filter(pl.Annotation.malignancy == 4)
    .first()
)

# Lower-case attribute vs. capitalized attribute of the same feature.
for raw_value, label_value in ((ann.malignancy, ann.Malignancy),
                               (ann.margin, ann.Margin)):
    print(raw_value, label_value)

ann.print_formatted_feature_table()
def get_visual(i):
    """Visualize the scan of the ``i``-th annotation that has texture 1.

    Args:
        i: Index into the query of annotations with ``texture == 1``.

    Raises:
        IndexError: If there is no ``i``-th such annotation, or no scan row
            matches the annotation's ``scan_id``.
    """
    ann = pl.query(pl.Annotation).filter(pl.Annotation.texture == 1)[i]
    # The scan is looked up by its primary key, so the query has at most one
    # row. It used to be indexed with the annotation index `[i]`, which
    # raised IndexError for every i >= 1.
    scan = pl.query(pl.Scan).filter(pl.Scan.id == ann.scan_id).first()
    if scan is None:
        raise IndexError(
            "no scan with id {} for annotation {}".format(ann.scan_id, i))
    scan.visualize(annotation_groups=scan.cluster_annotations())
image = np.stack([s.pixel_array for s in slices]) image = image.astype(np.int16) image[image == -2000] = 0 for slice_number in range(len(slices)): intercept = slices[slice_number].RescaleIntercept slope = slices[slice_number].RescaleSlope if slope != 1: image[slice_number] = slope * image[slice_number].astype( np.float64) image[slice_number] = image[slice_number].astype(np.int16) image[slice_number] += np.int16(intercept) return np.array(image, dtype=np.int16) all_scans = pl.query(pl.Scan) num_scans = all_scans.count() def save_slices(save_path, id_range=[0, num_scans], norm_range=np.array([[-1000, 200], [-250, 200], [-1000, -745]])): # Save slices as png with 3 channels # Create a table to save the corresponding origin path of the saved npys. with open('names.csv', 'w') as f: writer = csv.writer(f) writer.writerow(['short_name', 'origin_path']) with open('miss.csv', 'w') as f:
def get_scan_data():
    """Return the last scan of the ``pl.Scan`` query, or None if it is empty.

    The previous loop rebound the result on every iteration, so only the
    final scan was ever returned; that contract is kept. It used one extra
    query per scan and raised UnboundLocalError on an empty table.

    NOTE(review): the source of this function had lost its indentation; this
    reconstruction assumes the return followed the loop - confirm against
    the original file.
    """
    all_scans = pl.query(pl.Scan).all()
    if not all_scans:
        return None
    return all_scans[-1]
def prepare_dataset(self):
    """Export nodule slices, masks and clean slices for every patient.

    For each patient id in ``self.IDRI_list``:
      * with annotation clusters: a consensus mask is computed per nodule,
        and every slice whose mask sum exceeds ``self.mask_threshold`` is
        lung-segmented and saved (image and mask) as ``.npy`` files;
      * without clusters: the first 51 slices of the volume are
        lung-segmented and saved together with an all-zero mask.
    One metadata row per saved slice is recorded via ``self.save_meta`` and
    the table is finally written to ``meta_info.csv``.

    Returns None.

    NOTE(review): the source of this method had lost its indentation; the
    nesting below is a reconstruction - confirm against the original file.
    """
    # This is to name each image and mask: "000" .. "999"
    prefix = [str(x).zfill(3) for x in range(1000)]

    # Make output directories
    for directory in (self.img_path, self.mask_path, self.clean_path_img,
                      self.clean_path_mask, self.meta_path):
        os.makedirs(directory, exist_ok=True)

    IMAGE_DIR = Path(self.img_path)
    MASK_DIR = Path(self.mask_path)
    CLEAN_DIR_IMAGE = Path(self.clean_path_img)
    CLEAN_DIR_MASK = Path(self.clean_path_mask)

    for patient in tqdm(self.IDRI_list):
        pid = patient  # LIDC-IDRI-0001~
        scan = pl.query(pl.Scan).filter(pl.Scan.patient_id == pid).first()
        if scan is None:
            # .first() yields None for an unknown patient id; skip it
            # instead of failing on `None.cluster_annotations()`.
            print("No scan found for", pid)
            continue
        nodules_annotation = scan.cluster_annotations()
        vol = scan.to_volume()
        print(
            "Patient ID: {} Dicom Shape: {} Number of Annotated Nodules: {}"
            .format(pid, vol.shape, len(nodules_annotation)))

        patient_image_dir = IMAGE_DIR / pid
        patient_mask_dir = MASK_DIR / pid
        Path(patient_image_dir).mkdir(parents=True, exist_ok=True)
        Path(patient_mask_dir).mkdir(parents=True, exist_ok=True)

        if len(nodules_annotation) > 0:
            # Patients with nodules
            for nodule_idx, nodule in enumerate(nodules_annotation):
                # Each entry groups the annotations of a single nodule.
                mask, cbbox, _masks = consensus(nodule, self.c_level,
                                                self.padding)
                lung_np_array = vol[cbbox]

                # We calculate the malignancy information
                malignancy, cancer_label = self.calculate_malignancy(
                    nodule)

                for nodule_slice in range(mask.shape[2]):
                    slice_mask = mask[:, :, nodule_slice]
                    # Some masks are too small and may hinder training.
                    if np.sum(slice_mask) <= self.mask_threshold:
                        continue
                    # Segment lung part only
                    lung_segmented_np_array = segment_lung(
                        lung_np_array[:, :, nodule_slice])
                    # Normalizes values that compare equal to zero
                    # (including negative zero) to plain 0.
                    lung_segmented_np_array[lung_segmented_np_array ==
                                            -0] = 0
                    # Naming of each file: NI = Nodule Image, MA = Mask
                    nodule_name = "{}_NI{}_slice{}".format(
                        pid[-4:], prefix[nodule_idx], prefix[nodule_slice])
                    mask_name = "{}_MA{}_slice{}".format(
                        pid[-4:], prefix[nodule_idx], prefix[nodule_slice])
                    meta_list = [
                        pid[-4:], nodule_idx, prefix[nodule_slice],
                        nodule_name, mask_name, malignancy, cancer_label,
                        False
                    ]

                    self.save_meta(meta_list)
                    np.save(patient_image_dir / nodule_name,
                            lung_segmented_np_array)
                    np.save(patient_mask_dir / mask_name, slice_mask)
        else:
            print("Clean Dataset", pid)
            patient_clean_dir_image = CLEAN_DIR_IMAGE / pid
            patient_clean_dir_mask = CLEAN_DIR_MASK / pid
            Path(patient_clean_dir_image).mkdir(parents=True,
                                                exist_ok=True)
            Path(patient_clean_dir_mask).mkdir(parents=True, exist_ok=True)
            # Patients without any nodule form the clean dataset, which is
            # needed for validation. Only slice indices 0..50 are exported.
            for slice_idx in range(min(vol.shape[2], 51)):
                lung_segmented_np_array = segment_lung(vol[:, :,
                                                           slice_idx])
                lung_segmented_np_array[lung_segmented_np_array == -0] = 0
                lung_mask = np.zeros_like(lung_segmented_np_array)

                # CN = CleanNodule, CM = CleanMask
                nodule_name = "{}/{}_CN001_slice{}".format(
                    pid, pid[-4:], prefix[slice_idx])
                mask_name = "{}/{}_CM001_slice{}".format(
                    pid, pid[-4:], prefix[slice_idx])
                meta_list = [
                    pid[-4:], slice_idx, prefix[slice_idx], nodule_name,
                    mask_name, 0, False, True
                ]
                self.save_meta(meta_list)
                # The names above already start with "<pid>/", so they are
                # resolved against the clean root. Joining them with the
                # per-patient directory pointed at <root>/<pid>/<pid>/...,
                # which is never created.
                np.save(CLEAN_DIR_IMAGE / nodule_name,
                        lung_segmented_np_array)
                np.save(CLEAN_DIR_MASK / mask_name, lung_mask)

    print("Saved Meta data")
    self.meta.to_csv(self.meta_path + 'meta_info.csv', index=False)
def get_scan(name):
    """Return the first scan whose ``patient_id`` equals ``name``.

    The value comes straight from ``.first()`` on the filtered query.
    """
    is_patient = pl.Scan.patient_id == name
    patient_scans = pl.query(pl.Scan).filter(is_patient)
    return patient_scans.first()