def test_getfbounds():
    good_inputs = range(1, 12)
    bad_inputs = [(0, ValueError), (12, ValueError), (10.5, TypeError)]

    for n_echos in good_inputs:
        utils.getfbounds(n_echos)

    for (n_echos, err) in bad_inputs:
        with pytest.raises(err):
            utils.getfbounds(n_echos)
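
# Context for the test above: elsewhere in this codebase the return value of
# utils.getfbounds is unpacked as three F-statistic significance thresholds,
# e.g. ``fmin, fmid, fmax = utils.getfbounds(n_echos)`` in tedpca and
# ``F05, F025, F01 = utils.getfbounds(n_echos)`` in selcomps. A minimal,
# hedged usage sketch (the helper name and n_echos value are arbitrary):
def _example_getfbounds_usage(n_echos=4):
    f05, f025, f01 = utils.getfbounds(n_echos)  # p < .05, .025, .01 bounds
    assert f05 <= f025 <= f01  # critical values grow as p shrinks
    return f05, f025, f01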
def selcomps(seldict, mmix, mask, ref_img, manacc, n_echos, t2s, s0, olevel=2,
             oversion=99, filecsdata=True, savecsdiag=True,
             strict_mode=False):
    """
    Labels components in `mmix`

    Parameters
    ----------
    seldict : :obj:`dict`
        As output from `fitmodels_direct`
    mmix : (C x T) array_like
        Mixing matrix for converting input data to component space, where `C`
        is components and `T` is the number of volumes in the original data
    mask : (S,) array_like
        Boolean mask array
    ref_img : str or img_like
        Reference image to dictate how outputs are saved to disk
    manacc : str
        Comma-separated string of indices of manually accepted components
    n_echos : int
        Number of echos in original data
    t2s : (S,) array_like
        Estimated T2* map
    s0 : (S,) array_like
        S0 map
    olevel : int, optional
        Default: 2
    oversion : int, optional
        Default: 99
    filecsdata : bool, optional
        Default: True
    savecsdiag : bool, optional
        Default: True
    strict_mode : bool, optional
        Default: False

    Returns
    -------
    acc : list
        Indices of accepted (BOLD) components in `mmix`
    rej : list
        Indices of rejected (non-BOLD) components in `mmix`
    midk : list
        Indices of mid-K (questionable) components in `mmix`
    ign : list
        Indices of ignored components in `mmix`
    """
    if filecsdata:
        import bz2
        if seldict is not None:
            LGR.info('Saving component selection data')
            with bz2.BZ2File('compseldata.pklbz', 'wb') as csstate_f:
                pickle.dump(seldict, csstate_f)
        else:
            try:
                with bz2.BZ2File('compseldata.pklbz', 'rb') as csstate_f:
                    seldict = pickle.load(csstate_f)
            except FileNotFoundError:
                LGR.warning('Failed to load component selection data')
                return None

    # List of components
    midk = []
    ign = []
    nc = np.arange(len(seldict['Kappas']))
    ncl = np.arange(len(seldict['Kappas']))

    # If user has specified components to accept manually
    if manacc:
        acc = sorted([int(vv) for vv in manacc.split(',')])
        midk = []
        rej = sorted(np.setdiff1d(ncl, acc))
        return acc, rej, midk, []  # Add string for ign

    # Do some tallies for number of significant voxels
    countsigFS0 = seldict['F_S0_clmaps'].sum(0)
    countsigFR2 = seldict['F_R2_clmaps'].sum(0)
    countnoise = np.zeros(len(nc))

    # Make table of dice values
    dice_tbl = np.zeros([nc.shape[0], 2])
    for ii in ncl:
        dice_FR2 = utils.dice(
            utils.unmask(seldict['Br_clmaps_R2'][:, ii], mask)[t2s != 0],
            seldict['F_R2_clmaps'][:, ii])
        dice_FS0 = utils.dice(
            utils.unmask(seldict['Br_clmaps_S0'][:, ii], mask)[t2s != 0],
            seldict['F_S0_clmaps'][:, ii])
        dice_tbl[ii, :] = [dice_FR2, dice_FS0]  # step 3a here and above
    dice_tbl[np.isnan(dice_tbl)] = 0

    # Make table of noise gain
    tt_table = np.zeros([len(nc), 4])
    counts_FR2_Z = np.zeros([len(nc), 2])
    for ii in nc:
        comp_noise_sel = utils.andb([
            np.abs(seldict['Z_maps'][:, ii]) > 1.95,
            seldict['Z_clmaps'][:, ii] == 0
        ]) == 2
        countnoise[ii] = np.array(comp_noise_sel, dtype=np.int).sum()
        noise_FR2_Z_mask = utils.unmask(comp_noise_sel, mask)[t2s != 0]
        noise_FR2_Z = np.log10(
            np.unique(seldict['F_R2_maps'][noise_FR2_Z_mask, ii]))
        signal_FR2_Z_mask = utils.unmask(
            seldict['Z_clmaps'][:, ii], mask)[t2s != 0] == 1
        signal_FR2_Z = np.log10(
            np.unique(seldict['F_R2_maps'][signal_FR2_Z_mask, ii]))
        counts_FR2_Z[ii, :] = [len(signal_FR2_Z), len(noise_FR2_Z)]
        ttest = stats.ttest_ind(signal_FR2_Z, noise_FR2_Z, equal_var=True)
        # avoid DivideByZero RuntimeWarning
        if signal_FR2_Z.size > 0 and noise_FR2_Z.size > 0:
            mwu = stats.norm.ppf(
                stats.mannwhitneyu(signal_FR2_Z, noise_FR2_Z)[1])
        else:
            mwu = -np.inf
        tt_table[ii, 0] = np.abs(mwu) * ttest[0] / np.abs(ttest[0])
        tt_table[ii, 1] = ttest[1]
    tt_table[np.isnan(tt_table)] = 0
    tt_table[np.isinf(tt_table[:, 0]), 0] = np.percentile(
        tt_table[~np.isinf(tt_table[:, 0]), 0], 98)

    # Time series derivative kurtosis
    mmix_dt = (mmix[:-1] - mmix[1:])
    mmix_kurt = stats.kurtosis(mmix_dt)
    mmix_std = np.std(mmix_dt, axis=0)

    # Step 1: Reject anything that's obviously an artifact
    #   a. Estimate a null variance
    LGR.debug('Rejecting gross artifacts based on Rho/Kappa values and '
              'S0/R2 counts')
    rej = ncl[utils.andb([seldict['Rhos'] > seldict['Kappas'],
                          countsigFS0 > countsigFR2]) > 0]
    ncl = np.setdiff1d(ncl, rej)

    # Step 2: Compute 3-D spatial FFT of beta maps to detect high-spatial
    # frequency artifacts
    LGR.debug('Computing 3D spatial FFT of beta maps to detect high-spatial '
              'frequency artifacts')
    # spatial information is important so for NIFTI we convert back to 3D
    # space
    if utils.get_dtype(ref_img) == 'NIFTI':
        dim1 = np.prod(check_niimg(ref_img).shape[:2])
    else:
        dim1 = mask.shape[0]
    fproj_arr = np.zeros([dim1, len(nc)])
    fproj_arr_val = np.zeros([dim1, len(nc)])
    spr = []
    fdist = []
    for ii in nc:
        # convert data back to 3D array
        if utils.get_dtype(ref_img) == 'NIFTI':
            tproj = utils.new_nii_like(
                ref_img,
                utils.unmask(seldict['PSC'], mask)[:, ii]).get_data()
        else:
            tproj = utils.unmask(seldict['PSC'], mask)[:, ii]
        fproj = np.fft.fftshift(np.abs(np.fft.rfftn(tproj)))
        fproj_z = fproj.max(axis=-1)
        fproj[fproj == fproj.max()] = 0
        spr.append(np.array(fproj_z > fproj_z.max() / 4, dtype=np.int).sum())
        fproj_arr[:, ii] = stats.rankdata(fproj_z.flatten())
        fproj_arr_val[:, ii] = fproj_z.flatten()
        if utils.get_dtype(ref_img) == 'NIFTI':
            fprojr = np.array([fproj, fproj[:, :, ::-1]]).max(0)
            fdist.append(np.max([
                utils.fitgaussian(fprojr.max(jj))[3:].max()
                for jj in range(fprojr.ndim)
            ]))
        else:
            fdist = np.load(os.path.join(RESOURCES, 'fdist.npy'))
    if type(fdist) is not np.ndarray:
        fdist = np.array(fdist)
    spr = np.array(spr)
    # import ipdb; ipdb.set_trace()

    # Step 3: Create feature space of component properties
    LGR.debug('Creating feature space of component properties')
    fdist_pre = fdist.copy()
    fdist_pre[fdist > np.median(fdist) * 3] = np.median(fdist) * 3
    fdist_z = (fdist_pre - np.median(fdist_pre)) / fdist_pre.std()
    spz = (spr - spr.mean()) / spr.std()
    Tz = (tt_table[:, 0] - tt_table[:, 0].mean()) / tt_table[:, 0].std()
    varex_ = np.log(seldict['varex'])
    Vz = (varex_ - varex_.mean()) / varex_.std()
    Rz = (seldict['Rhos'] - seldict['Rhos'].mean()) / seldict['Rhos'].std()
    Ktz = np.log(seldict['Kappas']) / 2
    Ktz = (Ktz - Ktz.mean()) / Ktz.std()
    Rtz = np.log(seldict['Rhos']) / 2
    Rtz = (Rtz - Rtz.mean()) / Rtz.std()
    KRr = stats.zscore(np.log(seldict['Kappas']) / np.log(seldict['Rhos']))
    cnz = (countnoise - countnoise.mean()) / countnoise.std()
    Dz = stats.zscore(np.arctanh(dice_tbl[:, 0] + 0.001))
    fz = np.array([Tz, Vz, Ktz, KRr, cnz, Rz, mmix_kurt, fdist_z])

    # Step 4: Make initial guess of where BOLD components are and use DBSCAN
    # to exclude noise components and find a sample set of 'good' components
    LGR.debug('Making initial guess of BOLD components')
    # epsmap is [index, level of overlap with dicemask,
    #            number of high Rho components]
    F05, F025, F01 = utils.getfbounds(n_echos)
    epsmap = []
    Rhos_sorted = np.array(sorted(seldict['Rhos']))[::-1]
    # Make an initial guess as to number of good components based on
    # consensus of control points across Rhos and Kappas
    KRcutguesses = [
        getelbow_mod(seldict['Rhos']), getelbow_cons(seldict['Rhos']),
        getelbow_aggr(seldict['Rhos']), getelbow_mod(seldict['Kappas']),
        getelbow_cons(seldict['Kappas']), getelbow_aggr(seldict['Kappas'])
    ]
    Khighelbowval = stats.scoreatpercentile(
        [getelbow_mod(seldict['Kappas'], val=True),
         getelbow_cons(seldict['Kappas'], val=True),
         getelbow_aggr(seldict['Kappas'], val=True)] +
        list(utils.getfbounds(n_echos)),
        75, interpolation_method='lower')
    KRcut = np.median(KRcutguesses)
    # only use exclusive when inclusive is extremely inclusive - double KRcut
    cond1 = getelbow_cons(seldict['Kappas']) > KRcut * 2
    cond2 = getelbow_mod(seldict['Kappas'], val=True) < F01
    if cond1 and cond2:
        Kcut = getelbow_mod(seldict['Kappas'], val=True)
    else:
        Kcut = getelbow_cons(seldict['Kappas'], val=True)
    # only use inclusive when exclusive is extremely exclusive - half KRcut
    # (remember for Rho inclusive is higher, so want both Kappa and Rho
    # to default to lower)
    if getelbow_cons(seldict['Rhos']) > KRcut * 2:
        Rcut = getelbow_mod(seldict['Rhos'], val=True)
        # for above, consider something like:
        # min([getelbow_mod(Rhos, True), sorted(Rhos)[::-1][KRguess]])
    else:
        Rcut = getelbow_cons(seldict['Rhos'], val=True)
    if Rcut > Kcut:
        Kcut = Rcut  # Rcut should never be higher than Kcut
    KRelbow = utils.andb([seldict['Kappas'] > Kcut, seldict['Rhos'] < Rcut])
    # Make guess of Kundu et al 2011 plus remove high frequencies,
    # generally high variance, and high variance given low Kappa
    tt_lim = stats.scoreatpercentile(
        tt_table[tt_table[:, 0] > 0, 0], 75,
        interpolation_method='lower') / 3
    KRguess = np.setdiff1d(
        np.setdiff1d(nc[KRelbow == 2], rej),
        np.union1d(
            nc[tt_table[:, 0] < tt_lim],
            np.union1d(
                np.union1d(nc[spz > 1], nc[Vz > 2]),
                nc[utils.andb([
                    seldict['varex'] > 0.5 *
                    sorted(seldict['varex'])[::-1][int(KRcut)],
                    seldict['Kappas'] < 2 * Kcut
                ]) == 2])))
    guessmask = np.zeros(len(nc))
    guessmask[KRguess] = 1

    # Throw lower-risk bad components out
    rejB = ncl[utils.andb([
        tt_table[ncl, 0] < 0,
        seldict['varex'][ncl] > np.median(seldict['varex']),
        ncl > KRcut
    ]) == 3]
    rej = np.union1d(rej, rejB)
    ncl = np.setdiff1d(ncl, rej)

    LGR.debug('Using DBSCAN to find optimal set of "good" BOLD components')
    for ii in range(20000):
        eps = .005 + ii * .005
        db = DBSCAN(eps=eps, min_samples=3).fit(fz.T)

        # it would be great to have descriptive names, here
        # DBSCAN found at least three non-noisy clusters
        cond1 = db.labels_.max() > 1
        # DBSCAN didn't detect more classes than the total number of
        # components / 6
        cond2 = db.labels_.max() < len(nc) / 6
        # TODO: confirm if 0 is a special label for DBSCAN
        # my intuition here is that we're confirming DBSCAN labelled
        # previously rejected components as noise (i.e., no overlap between
        # `rej` and labelled DBSCAN components)
        cond3 = np.intersect1d(rej, nc[db.labels_ == 0]).shape[0] == 0
        # DBSCAN labelled less than half of the total components as noisy
        cond4 = np.array(db.labels_ == -1,
                         dtype=int).sum() / float(len(nc)) < .5

        if cond1 and cond2 and cond3 and cond4:
            epsmap.append([
                ii, utils.dice(guessmask, db.labels_ == 0),
                np.intersect1d(
                    nc[db.labels_ == 0],
                    nc[seldict['Rhos'] > getelbow_mod(Rhos_sorted,
                                                      val=True)]).shape[0]
            ])
        db = None

    epsmap = np.array(epsmap)
    LGR.debug('Found DBSCAN solutions for {}/20000 eps resolutions'.format(
        len(epsmap)))
    group0 = []
    dbscanfailed = False
    if len(epsmap) != 0:
        # Select index that maximizes Dice with guessmask but first
        # minimizes number of higher Rho components
        ii = int(epsmap[np.argmax(
            epsmap[epsmap[:, 2] == np.min(epsmap[:, 2]), 1], 0), 0])
        LGR.debug('Component selection tuning: {:.05f}'.format(
            epsmap[:, 1].max()))
        db = DBSCAN(eps=.005 + ii * .005, min_samples=3).fit(fz.T)
        ncl = nc[db.labels_ == 0]
        ncl = np.setdiff1d(ncl, rej)
        ncl = np.setdiff1d(ncl, ncl[ncl > len(nc) - len(rej)])
        group0 = ncl.copy()
        group_n1 = nc[db.labels_ == -1]
        to_clf = np.setdiff1d(nc, np.union1d(ncl, rej))
    if len(group0) == 0 or len(group0) < len(KRguess) * .5:
        dbscanfailed = True
        LGR.debug('DBSCAN guess failed; using elbow guess method instead')
        ncl = np.setdiff1d(
            np.setdiff1d(nc[KRelbow == 2], rej),
            np.union1d(
                nc[tt_table[:, 0] < tt_lim],
                np.union1d(
                    np.union1d(nc[spz > 1], nc[Vz > 2]),
                    nc[utils.andb([
                        seldict['varex'] > 0.5 *
                        sorted(seldict['varex'])[::-1][int(KRcut)],
                        seldict['Kappas'] < 2 * Kcut
                    ]) == 2])))
        group0 = ncl.copy()
        group_n1 = []
        to_clf = np.setdiff1d(nc, np.union1d(group0, rej))
    if len(group0) < 2 or (len(group0) < 4 and
                           float(len(rej)) / len(group0) > 3):
        LGR.warning('Extremely limited reliable BOLD signal space! '
                    'Not filtering components beyond BOLD/non-BOLD guesses.')
        midkfailed = True
        min_acc = np.array([])
        if len(group0) != 0:
            # For extremes, building in a 20% tolerance
            toacc_hi = np.setdiff1d(
                nc[utils.andb([
                    fdist <= np.max(fdist[group0]),
                    seldict['Rhos'] < F025, Vz > -2
                ]) == 3], np.union1d(group0, rej))
            min_acc = np.union1d(group0, toacc_hi)
            to_clf = np.setdiff1d(nc, np.union1d(min_acc, rej))
        else:
            toacc_hi = []
            min_acc = []
        diagstep_keys = [
            'Rejected components', 'Kappa-Rho cut point', 'Kappa cut point',
            'Rho cut point', 'DBSCAN failed to converge',
            'Mid-Kappa failed (limited BOLD signal)', 'Kappa-Rho guess',
            'min_acc', 'toacc_hi'
        ]
        diagstep_vals = [
            list(rej), KRcut, Kcut, Rcut, dbscanfailed, midkfailed,
            list(KRguess), list(min_acc), list(toacc_hi)
        ]
        with open('csstepdata.json', 'w') as ofh:
            json.dump(dict(zip(diagstep_keys, diagstep_vals)), ofh,
                      indent=4, sort_keys=True, default=str)
        return (list(sorted(min_acc)), list(sorted(rej)), [],
                list(sorted(to_clf)))

    # Find additional components to reject based on Dice - doing this here
    # since Dice is a little unstable, need to reference group0
    rej_supp = []
    dice_rej = False
    if not dbscanfailed and len(rej) + len(group0) < 0.75 * len(nc):
        dice_rej = True
        rej_supp = np.setdiff1d(
            np.setdiff1d(
                np.union1d(rej, nc[dice_tbl[nc, 0] <= dice_tbl[nc, 1]]),
                group0), group_n1)
        rej = np.union1d(rej, rej_supp)

    # Temporal features
    # larger is worse - spike
    mmix_kurt_z = ((mmix_kurt - mmix_kurt[group0].mean()) /
                   mmix_kurt[group0].std())
    # smaller is worse - drift
    mmix_std_z = -1 * ((mmix_std - mmix_std[group0].mean()) /
                       mmix_std[group0].std())
    mmix_kurt_z_max = np.max([mmix_kurt_z, mmix_std_z], 0)

    # Step 5: Classify midk and ignore using separate SVMs for different
    # variance regimes
    # To render hyperplane:
    #   min_x = np.min(spz2); max_x = np.max(spz2)
    #   # plotting separating hyperplane
    #   ww = clf_.coef_[0]
    #   aa = -ww[0] / ww[1]
    #   # make sure the next line is long enough
    #   xx = np.linspace(min_x - 2, max_x + 2)
    #   yy = aa * xx - (clf_.intercept_[0]) / ww[1]
    #   plt.plot(xx, yy, '-')
    LGR.debug('Attempting to classify midk components')
    # Tried getting rid of accepting based on SVM altogether,
    # now using only rejecting
    toacc_hi = np.setdiff1d(
        nc[utils.andb([
            fdist <= np.max(fdist[group0]), seldict['Rhos'] < F025, Vz > -2
        ]) == 3], np.union1d(group0, rej))
    toacc_lo = np.intersect1d(
        to_clf,
        nc[utils.andb([
            spz < 1, Rz < 0, mmix_kurt_z_max < 5, Dz > -1, Tz > -1, Vz < 0,
            seldict['Kappas'] >= F025,
            fdist < 3 * np.percentile(fdist[group0], 98)
        ]) == 8])
    midk_clf, clf_ = do_svm(
        fproj_arr_val[:, np.union1d(group0, rej)].T,
        [0] * len(group0) + [1] * len(rej),
        fproj_arr_val[:, to_clf].T, svmtype=2)
    midk = np.setdiff1d(
        to_clf[utils.andb([
            midk_clf == 1,
            seldict['varex'][to_clf] > np.median(seldict['varex'][group0])
        ]) == 2], np.union1d(toacc_hi, toacc_lo))
    # only use SVM to augment toacc_hi if toacc_hi isn't already
    # conflicting with SVM choice
    if len(np.intersect1d(
            to_clf[utils.andb([midk_clf == 1, Vz[to_clf] > 0]) == 2],
            toacc_hi)) == 0:
        svm_acc_fail = True
        toacc_hi = np.union1d(toacc_hi, to_clf[midk_clf == 0])
    else:
        svm_acc_fail = False

    # Step 6: Compute variance associated with low T2* areas
    # (e.g. draining veins and low T2* areas)
    # To write out veinmask:
    #   veinout = np.zeros(t2s.shape)
    #   veinout[t2s != 0] = veinmaskf
    #   utils.filewrite(veinout, 'veinmaskf', ref_img)
    #   veinBout = utils.unmask(veinmaskB, mask)
    #   utils.filewrite(veinBout, 'veins50', ref_img)
    LGR.debug('Computing variance associated with low T2* areas '
              '(e.g., draining veins)')
    tsoc_B_Zcl = np.zeros(seldict['tsoc_B'].shape)
    tsoc_B_Zcl[seldict['Z_clmaps'] != 0] = np.abs(
        seldict['tsoc_B'])[seldict['Z_clmaps'] != 0]
    sig_B = [
        stats.scoreatpercentile(tsoc_B_Zcl[tsoc_B_Zcl[:, ii] != 0, ii], 25)
        if len(tsoc_B_Zcl[tsoc_B_Zcl[:, ii] != 0, ii]) != 0 else 0
        for ii in nc
    ]
    sig_B = np.abs(seldict['tsoc_B']) > np.tile(
        sig_B, [seldict['tsoc_B'].shape[0], 1])
    veinmask = utils.andb([
        t2s < stats.scoreatpercentile(
            t2s[t2s != 0], 15, interpolation_method='lower'),
        t2s != 0
    ]) == 2
    veinmaskf = veinmask[mask]
    veinR = np.array(sig_B[veinmaskf].sum(0),
                     dtype=float) / sig_B[~veinmaskf].sum(0)
    veinR[np.isnan(veinR)] = 0
    veinc = np.union1d(rej, midk)
    rej_veinRZ = ((veinR - veinR[veinc].mean()) / veinR[veinc].std())[veinc]
    rej_veinRZ[rej_veinRZ < 0] = 0
    rej_veinRZ[countsigFR2[veinc] > np.array(veinmaskf, dtype=int).sum()] = 0
    t2s_lim = [
        stats.scoreatpercentile(t2s[t2s != 0], 50,
                                interpolation_method='lower'),
        stats.scoreatpercentile(t2s[t2s != 0], 80,
                                interpolation_method='lower') / 2
    ]
    phys_var_zs = []
    for t2sl_i in range(len(t2s_lim)):
        t2sl = t2s_lim[t2sl_i]
        veinW = sig_B[:, veinc] * np.tile(rej_veinRZ, [sig_B.shape[0], 1])
        veincand = utils.unmask(
            utils.andb([
                s0[t2s != 0] < np.median(s0[t2s != 0]), t2s[t2s != 0] < t2sl
            ]) >= 1, t2s != 0)[mask]
        veinW[~veincand] = 0
        invein = veinW.sum(axis=1)[
            (utils.unmask(veinmaskf, mask) *
             utils.unmask(veinW.sum(axis=1) > 1, mask))[mask]]
        minW = 10 * (np.log10(invein).mean()) - 1 * 10**(
            np.log10(invein).std())
        veinmaskB = veinW.sum(axis=1) > minW
        tsoc_Bp = seldict['tsoc_B'].copy()
        tsoc_Bp[tsoc_Bp < 0] = 0
        vvex = np.array([
            (tsoc_Bp[veinmaskB, ii]**2.).sum() / (tsoc_Bp[:, ii]**2.).sum()
            for ii in nc
        ])
        group0_res = np.intersect1d(KRguess, group0)
        phys_var_zs.append(
            (vvex - vvex[group0_res].mean()) / vvex[group0_res].std())
        veinBout = utils.unmask(veinmaskB, mask)
        utils.filewrite(veinBout.astype(float), 'veins_l%i' % t2sl_i,
                        ref_img)  # Mask to sample veins
    phys_var_z = np.array(phys_var_zs).max(0)
    Vz2 = (varex_ - varex_[group0].mean()) / varex_[group0].std()

    # Step 7: Learn joint TE-dependence spatial and temporal models to move
    # remaining artifacts to ignore class
    LGR.debug('Learning joint TE-dependence spatial/temporal models to '
              'ignore remaining artifacts')
    to_ign = []
    minK_ign = np.max([F05, getelbow_cons(seldict['Kappas'], val=True)])
    newcest = len(group0) + len(
        toacc_hi[seldict['Kappas'][toacc_hi] > minK_ign])
    phys_art = np.setdiff1d(
        nc[utils.andb([phys_var_z > 3.5,
                       seldict['Kappas'] < minK_ign]) == 2], group0)
    rank_diff = stats.rankdata(phys_var_z) - stats.rankdata(seldict['Kappas'])
    phys_art = np.union1d(
        np.setdiff1d(
            nc[utils.andb([phys_var_z > 2, rank_diff > newcest / 2,
                           Vz2 > -1]) == 3], group0), phys_art)
    # Want to replace field_art with an acf/SVM based approach
    # instead of a kurtosis/filter one
    field_art = np.setdiff1d(
        nc[utils.andb([mmix_kurt_z_max > 5,
                       seldict['Kappas'] < minK_ign]) == 2], group0)
    field_art = np.union1d(
        np.setdiff1d(
            nc[utils.andb([
                mmix_kurt_z_max > 2,
                (stats.rankdata(mmix_kurt_z_max) -
                 stats.rankdata(seldict['Kappas'])) > newcest / 2,
                Vz2 > 1, seldict['Kappas'] < F01
            ]) == 4], group0), field_art)
    field_art = np.union1d(
        np.setdiff1d(
            nc[utils.andb([
                mmix_kurt_z_max > 3, Vz2 > 3,
                seldict['Rhos'] > np.percentile(seldict['Rhos'][group0], 75)
            ]) == 3], group0), field_art)
    field_art = np.union1d(
        np.setdiff1d(nc[utils.andb([mmix_kurt_z_max > 5, Vz2 > 5]) == 2],
                     group0), field_art)
    misc_art = np.setdiff1d(
        nc[utils.andb([
            (stats.rankdata(Vz) - stats.rankdata(Ktz)) > newcest / 2,
            seldict['Kappas'] < Khighelbowval
        ]) == 2], group0)
    ign_cand = np.unique(list(field_art) + list(phys_art) + list(misc_art))
    midkrej = np.union1d(midk, rej)
    to_ign = np.setdiff1d(list(ign_cand), midkrej)
    toacc = np.union1d(toacc_hi, toacc_lo)
    ncl = np.setdiff1d(np.union1d(ncl, toacc), np.union1d(to_ign, midkrej))
    ign = np.setdiff1d(nc, list(ncl) + list(midk) + list(rej))
    orphan = np.setdiff1d(
        nc, list(ncl) + list(to_ign) + list(midk) + list(rej))

    # Last ditch effort to save some transient components
    if not strict_mode:
        Vz3 = (varex_ - varex_[ncl].mean()) / varex_[ncl].std()
        ncl = np.union1d(
            ncl,
            np.intersect1d(
                orphan,
                nc[utils.andb([
                    seldict['Kappas'] > F05, seldict['Rhos'] < F025,
                    seldict['Kappas'] > seldict['Rhos'], Vz3 <= -1,
                    Vz3 > -3, mmix_kurt_z_max < 2.5
                ]) == 6]))
        ign = np.setdiff1d(nc, list(ncl) + list(midk) + list(rej))
        orphan = np.setdiff1d(
            nc, list(ncl) + list(to_ign) + list(midk) + list(rej))

    if savecsdiag:
        diagstep_keys = [
            'Rejected components', 'Kappa-Rho cut point', 'Kappa cut',
            'Rho cut', 'DBSCAN failed to converge', 'Kappa-Rho guess',
            'Dice rejected', 'rej_supp', 'to_clf', 'Mid-kappa components',
            'svm_acc_fail', 'toacc_hi', 'toacc_lo', 'Field artifacts',
            'Physiological artifacts', 'Miscellaneous artifacts', 'ncl',
            'Ignored components'
        ]
        diagstep_vals = [
            list(rej), KRcut.item(), Kcut.item(), Rcut.item(), dbscanfailed,
            list(KRguess), dice_rej, list(rej_supp), list(to_clf),
            list(midk), svm_acc_fail, list(toacc_hi), list(toacc_lo),
            list(field_art), list(phys_art), list(misc_art), list(ncl),
            list(ign)
        ]
        with open('csstepdata.json', 'w') as ofh:
            json.dump(dict(zip(diagstep_keys, diagstep_vals)), ofh,
                      indent=4, sort_keys=True, default=str)
        allfz = np.array([Tz, Vz, Ktz, KRr, cnz, Rz, mmix_kurt, fdist_z])
        np.savetxt('csdata.txt', allfz)

    return (list(sorted(ncl)), list(sorted(rej)), list(sorted(midk)),
            list(sorted(ign)))
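
# A minimal usage sketch for selcomps. It assumes `seldict` and `mmix` come
# from fitmodels_direct (defined later in this file) and that mask, ref_img,
# t2s, and s0 match the shapes in the docstring; the helper name is
# hypothetical and the call writes diagnostic files as a side effect.
def _example_selcomps_usage(seldict, mmix, mask, ref_img, n_echos, t2s, s0):
    # manacc='' disables the manual-acceptance branch
    acc, rej, midk, ign = selcomps(seldict, mmix, mask, ref_img, manacc='',
                                   n_echos=n_echos, t2s=t2s, s0=s0)
    # acc: BOLD; rej/midk: removed during denoising; ign: left in the data
    return acc, rej, midk, ign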
def fitmodels_direct(catd, mmix, mask, t2s, t2sG, tes, combmode, ref_img,
                     fout=None, reindex=False, mmixN=None, full_sel=True):
    """
    Fit models directly.

    Parameters
    ----------
    catd : (S x E x T) array_like
        Input data, where `S` is samples, `E` is echos, and `T` is time
    mmix : (T x C) array_like
        Mixing matrix for converting input data to component space, where `C`
        is components and `T` is the same as in `catd`
    mask : (S,) array_like
        Boolean mask array
    t2s : (S,) array_like
    t2sG : (S,) array_like
    tes : list
        List of echo times associated with `catd`, in milliseconds
    combmode : {'t2s', 'ste'} str
        How optimal combination of echos should be made, where 't2s'
        indicates using the method of Posse 1999 and 'ste' indicates using
        the method of Poser 2006
    ref_img : str or img_like
        Reference image to dictate how outputs are saved to disk
    fout : bool
        Whether to output per-component TE-dependence maps. Default: None
    reindex : bool, optional
        Default: False
    mmixN : array_like, optional
        Default: None
    full_sel : bool, optional
        Whether to perform selection of components based on Rho/Kappa scores.
        Default: True

    Returns
    -------
    seldict : dict
    comptab : (N x 5) :obj:`numpy.ndarray`
        Array with columns denoting (1) index of component, (2) Kappa score
        of component, (3) Rho score of component, (4) variance explained by
        component, and (5) normalized variance explained by component
    betas : :obj:`numpy.ndarray`
    mmix_new : :obj:`numpy.ndarray`
    """
    # compute optimal combination of raw data
    tsoc = model.make_optcom(catd, t2sG, tes, mask, combmode,
                             verbose=False).astype(float)[mask]
    # demean optimal combination
    tsoc_dm = tsoc - tsoc.mean(axis=-1, keepdims=True)

    # compute un-normalized weight dataset (features)
    if mmixN is None:
        mmixN = mmix
    WTS = computefeats2(utils.unmask(tsoc, mask), mmixN, mask,
                        normalize=False)

    # compute PSC dataset - shouldn't have to refit data
    tsoc_B = get_coeffs(utils.unmask(tsoc_dm, mask), mask, mmix)[mask]
    tsoc_Babs = np.abs(tsoc_B)
    PSC = tsoc_B / tsoc.mean(axis=-1, keepdims=True) * 100

    # compute skews to determine signs based on unnormalized weights,
    # correct mmix & WTS signs based on spatial distribution tails
    signs = stats.skew(WTS, axis=0)
    signs /= np.abs(signs)
    mmix = mmix.copy()
    mmix *= signs
    WTS *= signs
    PSC *= signs
    totvar = (tsoc_B**2).sum()
    totvar_norm = (WTS**2).sum()

    # compute Betas and means over TEs for TE-dependence analysis
    betas = get_coeffs(catd,
                       np.repeat(mask[:, np.newaxis], len(tes), axis=1),
                       mmix)
    n_samp, n_echos, n_components = betas.shape
    n_voxels = mask.sum()
    n_data_voxels = (t2s != 0).sum()
    mu = catd.mean(axis=-1, dtype=float)
    tes = np.reshape(tes, (n_echos, 1))
    fmin, fmid, fmax = utils.getfbounds(n_echos)

    # mask arrays
    mumask = mu[t2s != 0]
    t2smask = t2s[t2s != 0]
    betamask = betas[t2s != 0]

    # set up Xmats
    X1 = mumask.T  # Model 1
    X2 = np.tile(tes, (1, n_data_voxels)) * mumask.T / t2smask.T  # Model 2

    # tables for component selection
    Kappas = np.zeros([n_components])
    Rhos = np.zeros([n_components])
    varex = np.zeros([n_components])
    varex_norm = np.zeros([n_components])
    Z_maps = np.zeros([n_voxels, n_components])
    F_R2_maps = np.zeros([n_data_voxels, n_components])
    F_S0_maps = np.zeros([n_data_voxels, n_components])
    Z_clmaps = np.zeros([n_voxels, n_components])
    F_R2_clmaps = np.zeros([n_data_voxels, n_components])
    F_S0_clmaps = np.zeros([n_data_voxels, n_components])
    Br_clmaps_R2 = np.zeros([n_voxels, n_components])
    Br_clmaps_S0 = np.zeros([n_voxels, n_components])

    LGR.info('Fitting TE- and S0-dependent models to components')
    for i in range(n_components):
        # size of B is (n_echos, n_samples)
        B = np.atleast_3d(betamask)[:, :, i].T
        alpha = (np.abs(B)**2).sum(axis=0)
        varex[i] = (tsoc_B[:, i]**2).sum() / totvar * 100.
        varex_norm[i] = (utils.unmask(WTS, mask)[t2s != 0][:, i]**2).sum() /\
            totvar_norm * 100.

        # S0 Model
        coeffs_S0 = (B * X1).sum(axis=0) / (X1**2).sum(axis=0)
        SSE_S0 = (B - X1 * np.tile(coeffs_S0, (n_echos, 1)))**2
        SSE_S0 = SSE_S0.sum(axis=0)
        F_S0 = (alpha - SSE_S0) * 2 / (SSE_S0)
        F_S0_maps[:, i] = F_S0

        # R2 Model
        coeffs_R2 = (B * X2).sum(axis=0) / (X2**2).sum(axis=0)
        SSE_R2 = (B - X2 * np.tile(coeffs_R2, (n_echos, 1)))**2
        SSE_R2 = SSE_R2.sum(axis=0)
        F_R2 = (alpha - SSE_R2) * 2 / (SSE_R2)
        F_R2_maps[:, i] = F_R2

        # compute weights as Z-values
        wtsZ = (WTS[:, i] - WTS[:, i].mean()) / WTS[:, i].std()
        wtsZ[np.abs(wtsZ) > Z_MAX] = (
            Z_MAX * (np.abs(wtsZ) / wtsZ))[np.abs(wtsZ) > Z_MAX]
        Z_maps[:, i] = wtsZ

        # compute Kappa and Rho
        F_S0[F_S0 > F_MAX] = F_MAX
        F_R2[F_R2 > F_MAX] = F_MAX
        norm_weights = np.abs(
            np.squeeze(utils.unmask(wtsZ, mask)[t2s != 0]**2.))
        Kappas[i] = np.average(F_R2, weights=norm_weights)
        Rhos[i] = np.average(F_S0, weights=norm_weights)

    # tabulate component values
    comptab_pre = np.vstack(
        [np.arange(n_components), Kappas, Rhos, varex, varex_norm]).T
    if reindex:
        # re-index all components in Kappa order
        comptab = comptab_pre[comptab_pre[:, 1].argsort()[::-1], :]
        Kappas = comptab[:, 1]
        Rhos = comptab[:, 2]
        varex = comptab[:, 3]
        varex_norm = comptab[:, 4]
        nnc = np.array(comptab[:, 0], dtype=np.int)
        mmix_new = mmix[:, nnc]
        F_S0_maps = F_S0_maps[:, nnc]
        F_R2_maps = F_R2_maps[:, nnc]
        Z_maps = Z_maps[:, nnc]
        WTS = WTS[:, nnc]
        PSC = PSC[:, nnc]
        tsoc_B = tsoc_B[:, nnc]
        tsoc_Babs = tsoc_Babs[:, nnc]
        comptab[:, 0] = np.arange(comptab.shape[0])
    else:
        comptab = comptab_pre
        mmix_new = mmix

    # full selection including clustering criteria
    seldict = None
    if full_sel:
        LGR.info('Performing spatial clustering of components')
        csize = np.max([int(n_voxels * 0.0005) + 5, 20])
        LGR.debug('Using minimum cluster size: {}'.format(csize))
        for i in range(n_components):
            # save out files
            out = np.zeros((n_samp, 4))
            out[:, 0] = np.squeeze(utils.unmask(PSC[:, i], mask))
            out[:, 1] = np.squeeze(utils.unmask(F_R2_maps[:, i], t2s != 0))
            out[:, 2] = np.squeeze(utils.unmask(F_S0_maps[:, i], t2s != 0))
            out[:, 3] = np.squeeze(utils.unmask(Z_maps[:, i], mask))

            if utils.get_dtype(ref_img) == 'GIFTI':
                continue  # TODO: pass through GIFTI file data as below

            ccimg = utils.new_nii_like(ref_img, out)

            # Do simple clustering on F
            sel = spatclust(ccimg, min_cluster_size=csize,
                            threshold=int(fmin), index=[1, 2],
                            mask=(t2s != 0))
            F_R2_clmaps[:, i] = sel[:, 0]
            F_S0_clmaps[:, i] = sel[:, 1]
            countsigFR2 = F_R2_clmaps[:, i].sum()
            countsigFS0 = F_S0_clmaps[:, i].sum()

            # Do simple clustering on Z at p<0.05
            sel = spatclust(ccimg, min_cluster_size=csize, threshold=1.95,
                            index=3, mask=mask)
            Z_clmaps[:, i] = sel

            # Do simple clustering on ranked signal-change map
            spclust_input = utils.unmask(stats.rankdata(tsoc_Babs[:, i]),
                                         mask)
            spclust_input = utils.new_nii_like(ref_img, spclust_input)
            Br_clmaps_R2[:, i] = spatclust(
                spclust_input, min_cluster_size=csize,
                threshold=max(tsoc_Babs.shape) - countsigFR2, mask=mask)
            Br_clmaps_S0[:, i] = spatclust(
                spclust_input, min_cluster_size=csize,
                threshold=max(tsoc_Babs.shape) - countsigFS0, mask=mask)

        seldict = {}
        selvars = [
            'Kappas', 'Rhos', 'WTS', 'varex', 'Z_maps', 'F_R2_maps',
            'F_S0_maps', 'Z_clmaps', 'F_R2_clmaps', 'F_S0_clmaps', 'tsoc_B',
            'Br_clmaps_R2', 'Br_clmaps_S0', 'PSC'
        ]
        for vv in selvars:
            seldict[vv] = eval(vv)

    return seldict, comptab, betas, mmix_new
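
# The S0- and R2*-model fits in the loop above are single-regressor least
# squares across echoes, computed voxelwise. This self-contained sketch
# mirrors that arithmetic on made-up arrays (shapes and values are
# arbitrary), including the fixed scaling factor of 2 used above:
def _example_te_dependence_fit():
    import numpy as np
    n_echos, n_vox = 3, 5
    rng = np.random.RandomState(0)
    B = rng.randn(n_echos, n_vox)        # component betas per echo/voxel
    X = rng.rand(n_echos, n_vox) + 0.5   # design matrix (like X1 or X2)
    coeffs = (B * X).sum(axis=0) / (X**2).sum(axis=0)  # per-voxel slope
    SSE = ((B - X * np.tile(coeffs, (n_echos, 1)))**2).sum(axis=0)
    alpha = (np.abs(B)**2).sum(axis=0)   # total sum of squares
    F = (alpha - SSE) * 2 / SSE          # pseudo-F map, one value per voxel
    return F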
def tedpca(catd, OCcatd, combmode, mask, t2s, t2sG, stabilize, ref_img, tes,
           kdaw, rdaw, ste=0, wvpca=False):
    """
    Use principal components analysis (PCA) to identify and remove thermal
    noise from multi-echo data.

    Parameters
    ----------
    catd : (S x E x T) array_like
        Input functional data
    OCcatd : (S x T) array_like
        Optimally-combined time series data
    combmode : {'t2s', 'ste'} str
        How optimal combination of echos should be made, where 't2s'
        indicates using the method of Posse 1999 and 'ste' indicates using
        the method of Poser 2006
    mask : (S,) array_like
        Boolean mask array
    stabilize : :obj:`bool`
        Whether to attempt to stabilize convergence of ICA by returning
        dimensionally-reduced data from PCA and component selection.
    ref_img : :obj:`str` or img_like
        Reference image to dictate how outputs are saved to disk
    tes : :obj:`list`
        List of echo times associated with `catd`, in milliseconds
    kdaw : :obj:`float`
        Dimensionality augmentation weight for Kappa calculations
    rdaw : :obj:`float`
        Dimensionality augmentation weight for Rho calculations
    ste : :obj:`int` or :obj:`list` of :obj:`int`, optional
        Which echos to use in PCA. Values -1 and 0 are special, where a value
        of -1 will indicate using all the echos and 0 will indicate using the
        optimal combination of the echos. A list can be provided to indicate
        a subset of echos. Default: 0
    wvpca : :obj:`bool`, optional
        Whether to apply wavelet denoising to data. Default: False

    Returns
    -------
    n_components : :obj:`int`
        Number of components retained from PCA decomposition
    dd : (S x T) :obj:`numpy.ndarray`
        Dimensionally reduced optimally combined functional data

    Notes
    -----
    ====================== =================================================
    Notation               Meaning
    ====================== =================================================
    :math:`\\kappa`        Component pseudo-F statistic for TE-dependent
                           (BOLD) model.
    :math:`\\rho`          Component pseudo-F statistic for TE-independent
                           (artifact) model.
    :math:`v`              Voxel
    :math:`V`              Total number of voxels in mask
    :math:`\\zeta`         Something
    :math:`c`              Component
    :math:`p`              Something else
    ====================== =================================================

    Steps:

    1.  Variance normalize either multi-echo or optimally combined data,
        depending on settings.
    2.  Decompose normalized data using PCA or SVD.
    3.  Compute :math:`{\\kappa}` and :math:`{\\rho}`:

        .. math::
            {\\kappa}_c = \\frac{\\sum_{v}^V {\\zeta}_{c,v}^p *
            F_{c,v,R_2^*}}{\\sum_{v}^V {\\zeta}_{c,v}^p}

            {\\rho}_c = \\frac{\\sum_{v}^V {\\zeta}_{c,v}^p *
            F_{c,v,S_0}}{\\sum_{v}^V {\\zeta}_{c,v}^p}

    4.  Some other stuff. Something about elbows.
    5.  Classify components as thermal noise if they meet both of the
        following criteria:

        - Nonsignificant :math:`{\\kappa}` and :math:`{\\rho}`.
        - Nonsignificant variance explained.

    Outputs:

    This function writes out several files:

    ======================    ============================================
    Filename                  Content
    ======================    ============================================
    pcastate.pkl              Values from PCA results.
    comp_table_pca.txt        PCA component table.
    mepca_mix.1D              PCA mixing matrix.
    ======================    ============================================
    """
    n_samp, n_echos, n_vols = catd.shape
    ste = np.array([int(ee) for ee in str(ste).split(',')])

    if len(ste) == 1 and ste[0] == -1:
        LGR.info('Computing PCA of optimally combined multi-echo data')
        d = OCcatd[utils.make_min_mask(
            OCcatd[:, np.newaxis, :])][:, np.newaxis, :]
    elif len(ste) == 1 and ste[0] == 0:
        LGR.info('Computing PCA of spatially concatenated multi-echo data')
        d = catd[mask].astype('float64')
    else:
        LGR.info('Computing PCA of echo #%s' % ','.join(
            [str(ee) for ee in ste]))
        d = np.stack([catd[mask, ee] for ee in ste - 1],
                     axis=1).astype('float64')

    eim = np.squeeze(eimask(d))
    d = np.squeeze(d[eim])

    dz = ((d.T - d.T.mean(axis=0)) / d.T.std(axis=0)).T  # var normalize ts
    dz = (dz - dz.mean()) / dz.std()  # var normalize everything

    if wvpca:
        dz, cAl = dwtmat(dz)

    if not op.exists('pcastate.pkl'):
        voxel_comp_weights, varex, comp_ts = run_svd(dz)

        # actual variance explained (normalized)
        varex_norm = varex / varex.sum()
        eigenvalue_elbow = getelbow(varex_norm, return_val=True)

        diff_varex_norm = np.abs(np.diff(varex_norm))
        lower_diff_varex_norm = diff_varex_norm[(len(diff_varex_norm) // 2):]
        varex_norm_thr = np.mean(
            [lower_diff_varex_norm.max(), diff_varex_norm.min()])
        varex_norm_min = varex_norm[
            (len(diff_varex_norm) // 2) +
            np.arange(len(lower_diff_varex_norm))[
                lower_diff_varex_norm >= varex_norm_thr][0] + 1]
        varex_norm_cum = np.cumsum(varex_norm)

        # Compute K and Rho for PCA comps
        eimum = np.atleast_2d(eim)
        eimum = np.transpose(eimum, np.argsort(eimum.shape)[::-1])
        eimum = eimum.prod(axis=1)
        o = np.zeros((mask.shape[0], *eimum.shape[1:]))
        o[mask] = eimum
        eimum = np.squeeze(o).astype(bool)

        vTmix = comp_ts.T
        vTmixN = ((vTmix.T - vTmix.T.mean(0)) / vTmix.T.std(0)).T
        LGR.info('Making initial component selection guess from PCA results')
        _, ct_df, betasv, v_T = model.fitmodels_direct(
            catd, comp_ts.T, eimum, t2s, t2sG, tes, combmode, ref_img,
            mmixN=vTmixN, full_sel=False)
        # varex_norm overrides normalized varex computed by fitmodels_direct
        ct_df['normalized variance explained'] = varex_norm

        # Save state
        fname = op.abspath('pcastate.pkl')
        LGR.info('Saving PCA results to: {}'.format(fname))
        pcastate = {
            'voxel_comp_weights': voxel_comp_weights,
            'varex': varex,
            'comp_ts': comp_ts,
            'comptable': ct_df,
            'eigenvalue_elbow': eigenvalue_elbow,
            'varex_norm_min': varex_norm_min,
            'varex_norm_cum': varex_norm_cum
        }
        try:
            with open(fname, 'wb') as handle:
                pickle.dump(pcastate, handle)
        except TypeError:
            LGR.warning('Could not save PCA solution')
    else:  # if loading existing state
        LGR.info('Loading PCA from: pcastate.pkl')
        with open('pcastate.pkl', 'rb') as handle:
            pcastate = pickle.load(handle)
        voxel_comp_weights = pcastate['voxel_comp_weights']
        varex = pcastate['varex']
        comp_ts = pcastate['comp_ts']
        ct_df = pcastate['comptable']
        eigenvalue_elbow = pcastate['eigenvalue_elbow']
        varex_norm_min = pcastate['varex_norm_min']
        varex_norm_cum = pcastate['varex_norm_cum']

    np.savetxt('mepca_mix.1D', comp_ts.T)

    # write component maps to 4D image
    comp_maps = np.zeros((OCcatd.shape[0], comp_ts.shape[0]))
    for i_comp in range(comp_ts.shape[0]):
        temp_comp_ts = comp_ts[i_comp, :][:, None]
        comp_map = utils.unmask(
            model.computefeats2(OCcatd, temp_comp_ts, mask), mask)
        comp_maps[:, i_comp] = np.squeeze(comp_map)
    io.filewrite(comp_maps, 'mepca_OC_components.nii', ref_img)

    fmin, fmid, fmax = utils.getfbounds(n_echos)
    kappa_thr = np.average(
        sorted([fmin, getelbow(ct_df['kappa'], return_val=True) / 2, fmid]),
        weights=[kdaw, 1, 1])
    rho_thr = np.average(
        sorted([fmin, getelbow_cons(ct_df['rho'], return_val=True) / 2,
                fmid]),
        weights=[rdaw, 1, 1])
    if int(kdaw) == -1:
        lim_idx = utils.andb([ct_df['kappa'] < fmid,
                              ct_df['kappa'] > fmin]) == 2
        kappa_lim = ct_df.loc[lim_idx, 'kappa'].values
        kappa_thr = kappa_lim[getelbow(kappa_lim)]

        lim_idx = utils.andb([ct_df['rho'] < fmid, ct_df['rho'] > fmin]) == 2
        rho_lim = ct_df.loc[lim_idx, 'rho'].values
        rho_thr = rho_lim[getelbow(rho_lim)]
        stabilize = True
    elif int(rdaw) == -1:
        lim_idx = utils.andb([ct_df['rho'] < fmid, ct_df['rho'] > fmin]) == 2
        rho_lim = ct_df.loc[lim_idx, 'rho'].values
        rho_thr = rho_lim[getelbow(rho_lim)]

    # Add new columns to comptable for classification
    ct_df['classification'] = 'accepted'
    ct_df['rationale'] = ''

    # Reject if low Kappa, Rho, and variance explained
    is_lowk = ct_df['kappa'] <= kappa_thr
    is_lowr = ct_df['rho'] <= rho_thr
    is_lowe = ct_df['normalized variance explained'] <= eigenvalue_elbow
    is_lowkre = is_lowk & is_lowr & is_lowe
    ct_df.loc[is_lowkre, 'classification'] = 'rejected'
    ct_df.loc[is_lowkre, 'rationale'] += 'low rho, kappa, and varex;'

    # Reject if low variance explained
    is_lows = ct_df['normalized variance explained'] <= varex_norm_min
    ct_df.loc[is_lows, 'classification'] = 'rejected'
    ct_df.loc[is_lows, 'rationale'] += 'low variance explained;'

    # Reject if Kappa over limit
    is_fmax1 = ct_df['kappa'] == F_MAX
    ct_df.loc[is_fmax1, 'classification'] = 'rejected'
    ct_df.loc[is_fmax1, 'rationale'] += 'kappa equals fmax;'

    # Reject if Rho over limit
    is_fmax2 = ct_df['rho'] == F_MAX
    ct_df.loc[is_fmax2, 'classification'] = 'rejected'
    ct_df.loc[is_fmax2, 'rationale'] += 'rho equals fmax;'

    if stabilize:
        temp7 = varex_norm_cum >= 0.95
        ct_df.loc[temp7, 'classification'] = 'rejected'
        ct_df.loc[temp7, 'rationale'] += 'cumulative var. explained above 95%;'
        under_fmin1 = ct_df['kappa'] <= fmin
        ct_df.loc[under_fmin1, 'classification'] = 'rejected'
        ct_df.loc[under_fmin1, 'rationale'] += 'kappa below fmin;'
        under_fmin2 = ct_df['rho'] <= fmin
        ct_df.loc[under_fmin2, 'classification'] = 'rejected'
        ct_df.loc[under_fmin2, 'rationale'] += 'rho below fmin;'

    ct_df.to_csv('comp_table_pca.txt', sep='\t', index=True,
                 index_label='component', float_format='%.6f')

    sel_idx = ct_df['classification'] == 'accepted'
    n_components = np.sum(sel_idx)
    voxel_kept_comp_weighted = (voxel_comp_weights[:, sel_idx] *
                                varex[None, sel_idx])
    kept_data = np.dot(voxel_kept_comp_weighted, comp_ts[sel_idx, :])

    if wvpca:
        kept_data = idwtmat(kept_data, cAl)

    LGR.info('Selected {0} components with Kappa threshold: {1:.02f}, '
             'Rho threshold: {2:.02f}'.format(n_components, kappa_thr,
                                              rho_thr))
    kept_data = stats.zscore(kept_data, axis=1)  # variance normalize ts
    kept_data = stats.zscore(kept_data, axis=None)  # var normalize everything

    return n_components, kept_data
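
# The kappa/rho formulas in the Notes above reduce to weighted averages of
# per-voxel F-statistics, with weights from the squared component map
# (p = 2 in the docstring's notation). A tiny numeric sketch with made-up
# values:
def _example_kappa_rho():
    import numpy as np
    F_R2 = np.array([10., 40., 5.])    # per-voxel F, TE-dependent model
    F_S0 = np.array([2., 3., 20.])     # per-voxel F, TE-independent model
    zeta = np.array([0.5, 2.0, 0.1])   # per-voxel component weights
    weights = zeta**2
    kappa = np.average(F_R2, weights=weights)  # driven by high-weight voxels
    rho = np.average(F_S0, weights=weights)
    return kappa, rho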
def tedpca(catd, OCcatd, combmode, mask, t2s, t2sG, stabilize, ref_img, tes,
           kdaw, rdaw, ste=0, mlepca=True):
    """
    Use principal components analysis (PCA) to identify and remove thermal
    noise from multi-echo data.

    Parameters
    ----------
    catd : (S x E x T) array_like
        Input functional data
    OCcatd : (S x T) array_like
        Optimally-combined time series data
    combmode : {'t2s', 'ste'} str
        How optimal combination of echos should be made, where 't2s'
        indicates using the method of Posse 1999 and 'ste' indicates using
        the method of Poser 2006
    mask : (S,) array_like
        Boolean mask array
    stabilize : bool
        Whether to attempt to stabilize convergence of ICA by returning
        dimensionally-reduced data from PCA and component selection.
    ref_img : str or img_like
        Reference image to dictate how outputs are saved to disk
    tes : list
        List of echo times associated with `catd`, in milliseconds
    kdaw : float
        Dimensionality augmentation weight for Kappa calculations
    rdaw : float
        Dimensionality augmentation weight for Rho calculations
    ste : int or list-of-int, optional
        Which echos to use in PCA. Values -1 and 0 are special, where a value
        of -1 will indicate using all the echos and 0 will indicate using the
        optimal combination of the echos. A list can be provided to indicate
        a subset of echos. Default: 0
    mlepca : bool, optional
        Whether to use the method originally explained in Minka, NIPS 2000
        for guessing PCA dimensionality instead of a traditional SVD.
        Default: True

    Returns
    -------
    n_components : int
        Number of components retained from PCA decomposition
    dd : (S x E x T) :obj:`numpy.ndarray`
        Dimensionally-reduced functional data

    Notes
    -----
    ====================== =================================================
    Notation               Meaning
    ====================== =================================================
    :math:`\\kappa`        Component pseudo-F statistic for TE-dependent
                           (BOLD) model.
    :math:`\\rho`          Component pseudo-F statistic for TE-independent
                           (artifact) model.
    :math:`v`              Voxel
    :math:`V`              Total number of voxels in mask
    :math:`\\zeta`         Something
    :math:`c`              Component
    :math:`p`              Something else
    ====================== =================================================

    Steps:

    1.  Variance normalize either multi-echo or optimally combined data,
        depending on settings.
    2.  Decompose normalized data using PCA or SVD.
    3.  Compute :math:`{\\kappa}` and :math:`{\\rho}`:

        .. math::
            {\\kappa}_c = \\frac{\\sum_{v}^V {\\zeta}_{c,v}^p *
            F_{c,v,R_2^*}}{\\sum_{v}^V {\\zeta}_{c,v}^p}

            {\\rho}_c = \\frac{\\sum_{v}^V {\\zeta}_{c,v}^p *
            F_{c,v,S_0}}{\\sum_{v}^V {\\zeta}_{c,v}^p}

    4.  Some other stuff. Something about elbows.
    5.  Classify components as thermal noise if they meet both of the
        following criteria:

        - Nonsignificant :math:`{\\kappa}` and :math:`{\\rho}`.
        - Nonsignificant variance explained.
    """
    n_samp, n_echos, n_vols = catd.shape
    ste = np.array([int(ee) for ee in str(ste).split(',')])

    if len(ste) == 1 and ste[0] == -1:
        LGR.info('Computing PCA of optimally combined multi-echo data')
        d = OCcatd[utils.make_min_mask(
            OCcatd[:, np.newaxis, :])][:, np.newaxis, :]
    elif len(ste) == 1 and ste[0] == 0:
        LGR.info('Computing PCA of spatially concatenated multi-echo data')
        d = catd[mask].astype('float64')
    else:
        LGR.info('Computing PCA of echo #%s' % ','.join(
            [str(ee) for ee in ste]))
        d = np.stack([catd[mask, ee] for ee in ste - 1],
                     axis=1).astype('float64')

    eim = np.squeeze(eimask(d))
    d = np.squeeze(d[eim])

    dz = ((d.T - d.T.mean(axis=0)) / d.T.std(axis=0)).T  # var normalize ts
    dz = (dz - dz.mean()) / dz.std()  # var normalize everything

    if not op.exists('pcastate.pkl'):
        # do PC dimension selection and get eigenvalue cutoff
        if mlepca:
            from sklearn.decomposition import PCA
            ppca = PCA(n_components='mle', svd_solver='full')
            ppca.fit(dz)
            v = ppca.components_
            s = ppca.explained_variance_
            u = np.dot(np.dot(dz, v.T), np.diag(1. / s))
        else:
            u, s, v = np.linalg.svd(dz, full_matrices=0)

        # actual variance explained (normalized)
        sp = s / s.sum()
        eigelb = getelbow_mod(sp, val=True)

        spdif = np.abs(np.diff(sp))
        spdifh = spdif[(len(spdif) // 2):]
        spdthr = np.mean([spdifh.max(), spdif.min()])
        spmin = sp[(len(spdif) // 2) +
                   np.arange(len(spdifh))[spdifh >= spdthr][0] + 1]
        spcum = np.cumsum(sp)

        # Compute K and Rho for PCA comps
        eimum = np.atleast_2d(eim)
        eimum = np.transpose(eimum, np.argsort(eimum.shape)[::-1])
        eimum = eimum.prod(axis=1)
        o = np.zeros((mask.shape[0], *eimum.shape[1:]))
        o[mask] = eimum
        eimum = np.squeeze(o).astype(bool)

        vTmix = v.T
        vTmixN = ((vTmix.T - vTmix.T.mean(0)) / vTmix.T.std(0)).T
        LGR.info('Making initial component selection guess from PCA results')
        _, ctb, betasv, v_T = model.fitmodels_direct(catd, v.T, eimum, t2s,
                                                     t2sG, tes, combmode,
                                                     ref_img, mmixN=vTmixN,
                                                     full_sel=False)
        ctb = ctb[ctb[:, 0].argsort(), :]
        ctb = np.vstack([ctb.T[:3], sp]).T

        # Save state
        fname = op.abspath('pcastate.pkl')
        LGR.info('Saving PCA results to: {}'.format(fname))
        pcastate = {'u': u, 's': s, 'v': v, 'ctb': ctb, 'eigelb': eigelb,
                    'spmin': spmin, 'spcum': spcum}
        try:
            with open(fname, 'wb') as handle:
                pickle.dump(pcastate, handle)
        except TypeError:
            LGR.warning('Could not save PCA solution')
    else:  # if loading existing state
        LGR.info('Loading PCA from: {}'.format('pcastate.pkl'))
        with open('pcastate.pkl', 'rb') as handle:
            pcastate = pickle.load(handle)
        u, s, v = pcastate['u'], pcastate['s'], pcastate['v']
        ctb, eigelb = pcastate['ctb'], pcastate['eigelb']
        spmin, spcum = pcastate['spmin'], pcastate['spcum']

    np.savetxt('comp_table_pca.txt', ctb[ctb[:, 1].argsort(), :][::-1])
    np.savetxt('mepca_mix.1D', v[ctb[:, 1].argsort()[::-1], :].T)

    kappas = ctb[ctb[:, 1].argsort(), 1]
    rhos = ctb[ctb[:, 2].argsort(), 2]
    fmin, fmid, fmax = utils.getfbounds(n_echos)
    kappa_thr = np.average(
        sorted([fmin, getelbow_mod(kappas, val=True) / 2, fmid]),
        weights=[kdaw, 1, 1])
    rho_thr = np.average(
        sorted([fmin, getelbow_cons(rhos, val=True) / 2, fmid]),
        weights=[rdaw, 1, 1])
    if int(kdaw) == -1:
        kappas_lim = kappas[utils.andb([kappas < fmid, kappas > fmin]) == 2]
        kappa_thr = kappas_lim[getelbow_mod(kappas_lim)]
        rhos_lim = rhos[utils.andb([rhos < fmid, rhos > fmin]) == 2]
        rho_thr = rhos_lim[getelbow_mod(rhos_lim)]
        stabilize = True
    if int(kdaw) != -1 and int(rdaw) == -1:
        rhos_lim = rhos[utils.andb([rhos < fmid, rhos > fmin]) == 2]
        rho_thr = rhos_lim[getelbow_mod(rhos_lim)]

    is_hik = np.array(ctb[:, 1] > kappa_thr, dtype=np.int)
    is_hir = np.array(ctb[:, 2] > rho_thr, dtype=np.int)
    is_hie = np.array(ctb[:, 3] > eigelb, dtype=np.int)
    is_his = np.array(ctb[:, 3] > spmin, dtype=np.int)
    is_not_fmax1 = np.array(ctb[:, 1] != F_MAX, dtype=np.int)
    is_not_fmax2 = np.array(ctb[:, 2] != F_MAX, dtype=np.int)
    pcscore = ((is_hik + is_hir + is_hie) * is_his * is_not_fmax1 *
               is_not_fmax2)
    if stabilize:
        temp7 = np.array(spcum < 0.95, dtype=np.int)
        temp8 = np.array(ctb[:, 2] > fmin, dtype=np.int)
        temp9 = np.array(ctb[:, 1] > fmin, dtype=np.int)
        pcscore = pcscore * temp7 * temp8 * temp9

    pcsel = pcscore > 0
    dd = u.dot(np.diag(s * np.array(pcsel, dtype=np.int))).dot(v)

    n_components = s[pcsel].shape[0]
    LGR.info('Selected {0} components with Kappa threshold: {1:.02f}, '
             'Rho threshold: {2:.02f}'.format(n_components, kappa_thr,
                                              rho_thr))

    dd = stats.zscore(dd.T, axis=0).T  # variance normalize timeseries
    dd = stats.zscore(dd, axis=None)  # variance normalize everything

    return n_components, dd
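
# The mlepca branch above uses scikit-learn's implementation of Minka's MLE
# estimate of PCA dimensionality (PCA(n_components='mle', svd_solver='full')).
# A self-contained toy demonstration of the same call on random low-rank
# data; the shapes are arbitrary and far smaller than real fMRI data:
def _example_mle_pca():
    import numpy as np
    from sklearn.decomposition import PCA
    rng = np.random.RandomState(0)
    signal = rng.randn(200, 5).dot(rng.randn(5, 50))  # rank-5 structure
    data = signal + 0.01 * rng.randn(200, 50)         # small isotropic noise
    ppca = PCA(n_components='mle', svd_solver='full')
    ppca.fit(data)
    return ppca.n_components_  # MLE-estimated dimensionality (close to 5)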
def kundu_tedpca(comptable, n_echos, kdaw=10., rdaw=1., stabilize=False):
    """
    Select PCA components using Kundu's decision tree approach.

    Parameters
    ----------
    comptable : :obj:`pandas.DataFrame`
        Component table with relevant metrics: kappa, rho, and normalized
        variance explained. Component number should be the index.
    n_echos : :obj:`int`
        Number of echoes in dataset.
    kdaw : :obj:`float`, optional
        Kappa dimensionality augmentation weight. Must be a non-negative
        float, or -1 (a special value). Default is 10.
    rdaw : :obj:`float`, optional
        Rho dimensionality augmentation weight. Must be a non-negative float,
        or -1 (a special value). Default is 1.
    stabilize : :obj:`bool`, optional
        Whether to stabilize convergence by reducing dimensionality, for low
        quality data. Default is False.

    Returns
    -------
    comptable : :obj:`pandas.DataFrame`
        Component table with components classified as 'accepted', 'rejected',
        or 'ignored'.
    """
    LGR.info('Performing PCA component selection with Kundu decision tree')
    comptable['classification'] = 'accepted'
    comptable['rationale'] = ''

    eigenvalue_elbow = getelbow(comptable['normalized variance explained'],
                                return_val=True)

    diff_varex_norm = np.abs(
        np.diff(comptable['normalized variance explained']))
    lower_diff_varex_norm = diff_varex_norm[(len(diff_varex_norm) // 2):]
    varex_norm_thr = np.mean(
        [lower_diff_varex_norm.max(), diff_varex_norm.min()])
    varex_norm_min = comptable['normalized variance explained'][
        (len(diff_varex_norm) // 2) +
        np.arange(len(lower_diff_varex_norm))[
            lower_diff_varex_norm >= varex_norm_thr][0] + 1]
    varex_norm_cum = np.cumsum(comptable['normalized variance explained'])

    fmin, fmid, fmax = utils.getfbounds(n_echos)
    if int(kdaw) == -1:
        lim_idx = utils.andb([comptable['kappa'] < fmid,
                              comptable['kappa'] > fmin]) == 2
        kappa_lim = comptable.loc[lim_idx, 'kappa'].values
        kappa_thr = kappa_lim[getelbow(kappa_lim)]

        lim_idx = utils.andb([comptable['rho'] < fmid,
                              comptable['rho'] > fmin]) == 2
        rho_lim = comptable.loc[lim_idx, 'rho'].values
        rho_thr = rho_lim[getelbow(rho_lim)]
        stabilize = True
        LGR.info('kdaw set to -1. Switching TEDPCA method to '
                 'kundu-stabilize')
    elif int(rdaw) == -1:
        lim_idx = utils.andb([comptable['rho'] < fmid,
                              comptable['rho'] > fmin]) == 2
        rho_lim = comptable.loc[lim_idx, 'rho'].values
        rho_thr = rho_lim[getelbow(rho_lim)]
    else:
        kappa_thr = np.average(
            sorted([fmin,
                    (getelbow(comptable['kappa'], return_val=True) / 2),
                    fmid]),
            weights=[kdaw, 1, 1])
        rho_thr = np.average(
            sorted([fmin,
                    (getelbow_cons(comptable['rho'], return_val=True) / 2),
                    fmid]),
            weights=[rdaw, 1, 1])

    # Reject if low Kappa, Rho, and variance explained
    is_lowk = comptable['kappa'] <= kappa_thr
    is_lowr = comptable['rho'] <= rho_thr
    is_lowe = comptable['normalized variance explained'] <= eigenvalue_elbow
    is_lowkre = is_lowk & is_lowr & is_lowe
    comptable.loc[is_lowkre, 'classification'] = 'rejected'
    comptable.loc[is_lowkre, 'rationale'] += 'P001;'

    # Reject if low variance explained
    is_lows = comptable['normalized variance explained'] <= varex_norm_min
    comptable.loc[is_lows, 'classification'] = 'rejected'
    comptable.loc[is_lows, 'rationale'] += 'P002;'

    # Reject if Kappa over limit
    is_fmax1 = comptable['kappa'] == F_MAX
    comptable.loc[is_fmax1, 'classification'] = 'rejected'
    comptable.loc[is_fmax1, 'rationale'] += 'P003;'

    # Reject if Rho over limit
    is_fmax2 = comptable['rho'] == F_MAX
    comptable.loc[is_fmax2, 'classification'] = 'rejected'
    comptable.loc[is_fmax2, 'rationale'] += 'P004;'

    if stabilize:
        temp7 = varex_norm_cum >= 0.95
        comptable.loc[temp7, 'classification'] = 'rejected'
        comptable.loc[temp7, 'rationale'] += 'P005;'
        under_fmin1 = comptable['kappa'] <= fmin
        comptable.loc[under_fmin1, 'classification'] = 'rejected'
        comptable.loc[under_fmin1, 'rationale'] += 'P006;'
        under_fmin2 = comptable['rho'] <= fmin
        comptable.loc[under_fmin2, 'classification'] = 'rejected'
        comptable.loc[under_fmin2, 'rationale'] += 'P007;'

    n_components = comptable.loc[
        comptable['classification'] == 'accepted'].shape[0]
    LGR.info('Selected {0} components with Kappa threshold: {1:.02f}, Rho '
             'threshold: {2:.02f}'.format(n_components, kappa_thr, rho_thr))

    # Move decision columns to end
    cols_at_end = ['classification', 'rationale']
    comptable = comptable[[c for c in comptable if c not in cols_at_end] +
                          [c for c in cols_at_end if c in comptable]]
    comptable['rationale'] = comptable['rationale'].str.rstrip(';')
    return comptable
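
# A minimal usage sketch for kundu_tedpca with a made-up component table.
# Column names follow the docstring; four components are far too few for a
# meaningful elbow, so this is illustrative only:
def _example_kundu_tedpca_usage():
    import pandas as pd
    comptable = pd.DataFrame({
        'kappa': [80., 45., 12., 8.],
        'rho': [10., 15., 11., 9.],
        'normalized variance explained': [0.5, 0.3, 0.15, 0.05],
    })
    comptable.index.name = 'component'
    # n_echos=4 is arbitrary; kdaw/rdaw keep their documented defaults
    return kundu_tedpca(comptable, n_echos=4)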
def fitmodels_direct(catd, mmix, mask, t2s, t2s_full, tes, combmode, ref_img,
                     reindex=False, mmixN=None, full_sel=True, label=None,
                     out_dir='.', verbose=False):
    """
    Fit TE-dependence and -independence models to components.

    Parameters
    ----------
    catd : (S x E x T) array_like
        Input data, where `S` is samples, `E` is echos, and `T` is time
    mmix : (T x C) array_like
        Mixing matrix for converting input data to component space, where `C`
        is components and `T` is the same as in `catd`
    mask : (S [x E]) array_like
        Boolean mask array
    t2s : (S [x T]) array_like
        Limited T2* map or timeseries.
    t2s_full : (S [x T]) array_like
        Full T2* map or timeseries. For voxels with good signal in only one
        echo, which are zeros in the limited T2* map, this map uses the T2*
        estimate using the first two echoes.
    tes : list
        List of echo times associated with `catd`, in milliseconds
    combmode : {'t2s', 'paid'} str
        How optimal combination of echos should be made, where 't2s'
        indicates using the method of Posse 1999 and 'paid' indicates using
        the method of Poser 2006
    ref_img : str or img_like
        Reference image to dictate how outputs are saved to disk
    reindex : bool, optional
        Default: False
    mmixN : array_like, optional
        Default: None
    full_sel : bool, optional
        Whether to perform selection of components based on Rho/Kappa scores.
        Default: True

    Returns
    -------
    seldict : dict
    comptable : (C x X) :obj:`pandas.DataFrame`
        Component metric table. One row for each component, with a column for
        each metric. The index is the component number.
    betas : :obj:`numpy.ndarray`
    mmix_new : :obj:`numpy.ndarray`
    """
    if not (catd.shape[0] == t2s.shape[0] == t2s_full.shape[0] ==
            mask.shape[0]):
        raise ValueError('First dimensions (number of samples) of catd '
                         '({0}), t2s ({1}), and mask ({2}) do not '
                         'match'.format(catd.shape[0], t2s.shape[0],
                                        mask.shape[0]))
    elif catd.shape[1] != len(tes):
        raise ValueError('Second dimension of catd ({0}) does not match '
                         'number of echoes provided (tes; '
                         '{1})'.format(catd.shape[1], len(tes)))
    elif catd.shape[2] != mmix.shape[0]:
        raise ValueError('Third dimension (number of volumes) of catd ({0}) '
                         'does not match first dimension of '
                         'mmix ({1})'.format(catd.shape[2], mmix.shape[0]))
    elif t2s.shape != t2s_full.shape:
        raise ValueError('Shape of t2s array {0} does not match shape of '
                         't2s_full array {1}'.format(t2s.shape,
                                                     t2s_full.shape))
    elif t2s.ndim == 2:
        if catd.shape[2] != t2s.shape[1]:
            raise ValueError('Third dimension (number of volumes) of catd '
                             '({0}) does not match second dimension of '
                             't2s ({1})'.format(catd.shape[2], t2s.shape[1]))

    mask = t2s != 0  # Override mask because problems

    # compute optimal combination of raw data
    tsoc = combine.make_optcom(catd, tes, mask, t2s=t2s_full,
                               combmode=combmode,
                               verbose=False).astype(float)[mask]
    # demean optimal combination
    tsoc_dm = tsoc - tsoc.mean(axis=-1, keepdims=True)

    # compute un-normalized weight dataset (features)
    if mmixN is None:
        mmixN = mmix
    WTS = computefeats2(utils.unmask(tsoc, mask), mmixN, mask,
                        normalize=False)

    # compute PSC dataset - shouldn't have to refit data
    tsoc_B = get_coeffs(tsoc_dm, mmix, mask=None)
    tsoc_Babs = np.abs(tsoc_B)
    PSC = tsoc_B / tsoc.mean(axis=-1, keepdims=True) * 100

    # compute skews to determine signs based on unnormalized weights,
    # correct mmix & WTS signs based on spatial distribution tails
    signs = stats.skew(WTS, axis=0)
    signs /= np.abs(signs)
    mmix = mmix.copy()
    mmix *= signs
    WTS *= signs
    PSC *= signs
    totvar = (tsoc_B**2).sum()
    totvar_norm = (WTS**2).sum()

    # compute Betas and means over TEs for TE-dependence analysis
    betas = get_coeffs(catd, mmix,
                       np.repeat(mask[:, np.newaxis], len(tes), axis=1))
    n_samp, n_echos, n_components = betas.shape
    n_voxels = mask.sum()
    n_data_voxels = (t2s != 0).sum()
    mu = catd.mean(axis=-1, dtype=float)
    tes = np.reshape(tes, (n_echos, 1))
    fmin, _, _ = utils.getfbounds(n_echos)

    # mask arrays
    mumask = mu[t2s != 0]
    t2smask = t2s[t2s != 0]
    betamask = betas[t2s != 0]

    # set up Xmats
    X1 = mumask.T  # Model 1
    X2 = np.tile(tes, (1, n_data_voxels)) * mumask.T / t2smask.T  # Model 2

    # tables for component selection
    kappas = np.zeros([n_components])
    rhos = np.zeros([n_components])
    varex = np.zeros([n_components])
    varex_norm = np.zeros([n_components])
    Z_maps = np.zeros([n_voxels, n_components])
    F_R2_maps = np.zeros([n_data_voxels, n_components])
    F_S0_maps = np.zeros([n_data_voxels, n_components])
    Z_clmaps = np.zeros([n_voxels, n_components])
    F_R2_clmaps = np.zeros([n_data_voxels, n_components])
    F_S0_clmaps = np.zeros([n_data_voxels, n_components])
    Br_R2_clmaps = np.zeros([n_voxels, n_components])
    Br_S0_clmaps = np.zeros([n_voxels, n_components])
    pred_R2_maps = np.zeros([n_data_voxels, n_echos, n_components])
    pred_S0_maps = np.zeros([n_data_voxels, n_echos, n_components])

    LGR.info('Fitting TE- and S0-dependent models to components')
    for i_comp in range(n_components):
        # size of B is (n_echoes, n_samples)
        B = np.atleast_3d(betamask)[:, :, i_comp].T
        alpha = (np.abs(B)**2).sum(axis=0)
        varex[i_comp] = (tsoc_B[:, i_comp]**2).sum() / totvar * 100.
        varex_norm[i_comp] = \
            (utils.unmask(WTS, mask)[t2s != 0][:, i_comp]**2).sum() /\
            totvar_norm * 100.

        # S0 Model
        # (S,) model coefficient map
        coeffs_S0 = (B * X1).sum(axis=0) / (X1**2).sum(axis=0)
        pred_S0 = X1 * np.tile(coeffs_S0, (n_echos, 1))
        pred_S0_maps[:, :, i_comp] = pred_S0.T
        SSE_S0 = (B - pred_S0)**2
        SSE_S0 = SSE_S0.sum(axis=0)  # (S,) prediction error map
        F_S0 = (alpha - SSE_S0) * (n_echos - 1) / (SSE_S0)
        F_S0_maps[:, i_comp] = F_S0

        # R2 Model
        coeffs_R2 = (B * X2).sum(axis=0) / (X2**2).sum(axis=0)
        pred_R2 = X2 * np.tile(coeffs_R2, (n_echos, 1))
        pred_R2_maps[:, :, i_comp] = pred_R2.T
        SSE_R2 = (B - pred_R2)**2
        SSE_R2 = SSE_R2.sum(axis=0)
        F_R2 = (alpha - SSE_R2) * (n_echos - 1) / (SSE_R2)
        F_R2_maps[:, i_comp] = F_R2

        # compute weights as Z-values
        wtsZ = (WTS[:, i_comp] - WTS[:, i_comp].mean()) / \
            WTS[:, i_comp].std()
        wtsZ[np.abs(wtsZ) > Z_MAX] = (
            Z_MAX * (np.abs(wtsZ) / wtsZ))[np.abs(wtsZ) > Z_MAX]
        Z_maps[:, i_comp] = wtsZ

        # compute Kappa and Rho
        F_S0[F_S0 > F_MAX] = F_MAX
        F_R2[F_R2 > F_MAX] = F_MAX
        norm_weights = np.abs(
            np.squeeze(utils.unmask(wtsZ, mask)[t2s != 0]**2.))
        kappas[i_comp] = np.average(F_R2, weights=norm_weights)
        rhos[i_comp] = np.average(F_S0, weights=norm_weights)

    # tabulate component values
    comptable = np.vstack([kappas, rhos, varex, varex_norm]).T
    if reindex:
        # re-index all components in Kappa order
        sort_idx = comptable[:, 0].argsort()[::-1]
        comptable = comptable[sort_idx, :]
        mmix_new = mmix[:, sort_idx]
        betas = betas[..., sort_idx]
        pred_R2_maps = pred_R2_maps[:, :, sort_idx]
        pred_S0_maps = pred_S0_maps[:, :, sort_idx]
        F_S0_maps = F_S0_maps[:, sort_idx]
        F_R2_maps = F_R2_maps[:, sort_idx]
        Z_maps = Z_maps[:, sort_idx]
        WTS = WTS[:, sort_idx]
        PSC = PSC[:, sort_idx]
        tsoc_B = tsoc_B[:, sort_idx]
        tsoc_Babs = tsoc_Babs[:, sort_idx]
    else:
        mmix_new = mmix

    if verbose:
        # Echo-specific weight maps for each of the ICA components.
        io.filewrite(betas,
                     op.join(out_dir, '{0}betas_catd.nii'.format(label)),
                     ref_img)
        # Echo-specific maps of predicted values for R2 and S0 models for
        # each component.
        io.filewrite(utils.unmask(pred_R2_maps, mask),
                     op.join(out_dir, '{0}R2_pred.nii'.format(label)),
                     ref_img)
        io.filewrite(utils.unmask(pred_S0_maps, mask),
                     op.join(out_dir, '{0}S0_pred.nii'.format(label)),
                     ref_img)
        # Weight maps used to average metrics across voxels
        io.filewrite(utils.unmask(Z_maps**2., mask),
                     op.join(out_dir,
                             '{0}metric_weights.nii'.format(label)),
                     ref_img)

    comptable = pd.DataFrame(comptable,
                             columns=['kappa', 'rho', 'variance explained',
                                      'normalized variance explained'])
    comptable.index.name = 'component'

    # full selection including clustering criteria
    seldict = None
    if full_sel:
        LGR.info('Performing spatial clustering of components')
        csize = np.max([int(n_voxels * 0.0005) + 5, 20])
        LGR.debug('Using minimum cluster size: {}'.format(csize))
        for i_comp in range(n_components):
            # Cluster-extent threshold and binarize F-maps
            ccimg = io.new_nii_like(
                ref_img,
                np.squeeze(utils.unmask(F_R2_maps[:, i_comp], t2s != 0)))
            F_R2_clmaps[:, i_comp] = utils.threshold_map(
                ccimg, min_cluster_size=csize, threshold=fmin, mask=mask,
                binarize=True)
            countsigFR2 = F_R2_clmaps[:, i_comp].sum()

            ccimg = io.new_nii_like(
                ref_img,
                np.squeeze(utils.unmask(F_S0_maps[:, i_comp], t2s != 0)))
            F_S0_clmaps[:, i_comp] = utils.threshold_map(
                ccimg, min_cluster_size=csize, threshold=fmin, mask=mask,
                binarize=True)
            countsigFS0 = F_S0_clmaps[:, i_comp].sum()

            # Cluster-extent threshold and binarize Z-maps with CDT of
            # p < 0.05
            ccimg = io.new_nii_like(
                ref_img,
                np.squeeze(utils.unmask(Z_maps[:, i_comp], t2s != 0)))
            Z_clmaps[:, i_comp] = utils.threshold_map(
                ccimg, min_cluster_size=csize, threshold=1.95, mask=mask,
                binarize=True)

            # Cluster-extent threshold and binarize ranked signal-change map
            ccimg = io.new_nii_like(
                ref_img,
                utils.unmask(stats.rankdata(tsoc_Babs[:, i_comp]),
                             t2s != 0))
            Br_R2_clmaps[:, i_comp] = utils.threshold_map(
                ccimg, min_cluster_size=csize,
                threshold=(max(tsoc_Babs.shape) - countsigFR2), mask=mask,
                binarize=True)
            Br_S0_clmaps[:, i_comp] = utils.threshold_map(
                ccimg, min_cluster_size=csize,
                threshold=(max(tsoc_Babs.shape) - countsigFS0), mask=mask,
                binarize=True)

        seldict = {}
        selvars = [
            'WTS', 'tsoc_B', 'PSC', 'Z_maps', 'F_R2_maps', 'F_S0_maps',
            'Z_clmaps', 'F_R2_clmaps', 'F_S0_clmaps', 'Br_R2_clmaps',
            'Br_S0_clmaps'
        ]
        for vv in selvars:
            seldict[vv] = eval(vv)

    return seldict, comptable, betas, mmix_new
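
# The sign-correction step in fitmodels_direct flips each component so that
# the heavier tail of its spatial weight map is positive: the sign of the
# skew of each WTS column is applied to mmix, WTS, and PSC alike. A
# self-contained sketch of that trick on synthetic maps:
def _example_sign_correction():
    import numpy as np
    from scipy import stats
    rng = np.random.RandomState(42)
    WTS = rng.randn(1000, 3)                 # fake component weight maps
    WTS[:, 1] -= rng.exponential(2., 1000)   # give one map a negative tail
    signs = stats.skew(WTS, axis=0)
    signs /= np.abs(signs)                   # reduces to -1 or +1 per column
    WTS_flipped = WTS * signs
    return (stats.skew(WTS_flipped, axis=0) >= 0).all()  # tails now positive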
def selcomps(seldict, mmix, mask, ref_img, manacc, n_echos, t2s, s0,
             olevel=2, oversion=99, filecsdata=True, savecsdiag=True,
             strict_mode=False):
    """
    Labels ICA components to keep or remove from denoised data

    The selection process uses pre-calculated parameters for each ICA
    component passed into this function in `seldict`, such as Kappa (a T2*
    weighting metric), Rho (an S0 weighting metric), and variance explained.
    Additional selection metrics are calculated within this function and then
    used to classify each component into one of four groups.

    Parameters
    ----------
    seldict : :obj:`dict`
        As output from `fitmodels_direct`
    mmix : (C x T) array_like
        Mixing matrix for converting input data to component space, where `C`
        is components and `T` is the number of volumes in the original data
    mask : (S,) array_like
        Boolean mask array
    ref_img : :obj:`str` or img_like
        Reference image to dictate how outputs are saved to disk
    manacc : :obj:`list`
        Comma-separated list of indices of manually accepted components
    n_echos : :obj:`int`
        Number of echos in original data
    t2s : (S,) array_like
        Estimated T2* map
    s0 : (S,) array_like
        S0 map
    olevel : :obj:`int`, optional
        Default: 2
    oversion : :obj:`int`, optional
        Default: 99
    filecsdata : :obj:`bool`, optional
        Default: True
    savecsdiag : :obj:`bool`, optional
        Default: True
    strict_mode : :obj:`bool`, optional
        Default: False

    Returns
    -------
    acc : :obj:`list`
        Indices of accepted (BOLD) components in `mmix`
    rej : :obj:`list`
        Indices of rejected (non-BOLD) components in `mmix`
    midk : :obj:`list`
        Indices of mid-K (questionable) components in `mmix`. These
        components are typically removed from the data during denoising
    ign : :obj:`list`
        Indices of ignored components in `mmix`. Ignored components are
        considered to have too low variance to matter. They are not processed
        through the accept vs reject decision tree and are NOT removed during
        the denoising process

    Notes
    -----
    The selection algorithm used in this function is from work by prantikk.
    It is from the selcomps function in select_model_fft20e.py in version 3.2
    of MEICA at:
    https://github.com/ME-ICA/me-ica/blob/b2781dd087ab9de99a2ec3925f04f02ce84f0adc/meica.libs/select_model_fft20e.py
    Many of the early publications using and evaluating the MEICA method used
    a different selection algorithm by prantikk. The final 2.5 version of
    that algorithm is in the selcomps function in select_model.py at:
    https://github.com/ME-ICA/me-ica/blob/b2781dd087ab9de99a2ec3925f04f02ce84f0adc/meica.libs/select_model.py
    In both algorithms, the ICA component selection process uses multiple
    metrics that include: kappa, rho, variance explained, component spatial
    weighting maps, noise and spatial frequency metrics, and measures of
    spatial overlap across metrics. The precise calculations may vary between
    algorithms. The most notable difference is that the v2.5 algorithm is a
    fixed decision tree where all decisions were made based on whether
    combinations of metrics crossed various thresholds. In the v3.2
    algorithm, clustering and support vector machines are also used to
    classify components based on how metrics in one component compare to
    metrics in other components.
""" if mmix.ndim != 2: raise ValueError('Parameter mmix should be 2d, not {0}d'.format(mmix.ndim)) elif t2s.ndim != 1: # FIT not necessarily supported raise ValueError('Parameter t2s should be 1d, not {0}d'.format(t2s.ndim)) elif s0.ndim != 1: # FIT not necessarily supported raise ValueError('Parameter s0 should be 1d, not {0}d'.format(s0.ndim)) elif not (t2s.shape[0] == s0.shape[0] == mask.shape[0]): raise ValueError('First dimensions (number of samples) of t2s ({0}), ' 's0 ({1}), and mask ({2}) do not ' 'match'.format(t2s.shape[0], s0.shape[0], mask.shape[0])) """ handwerkerd and others are working to "hypercomment" this function to help everyone understand it sufficiently with the goal of eventually modularizing the algorithm. This is still a work-in-process with later sections not fully commented, some points of uncertainty are noted, and the summary of the full algorithm is not yet complete. There are sections of this code that calculate metrics that are used in the decision tree for the selection process and other sections that are part of the decision tree. Certain comments are prefaced with METRIC and variable names to make clear which are metrics and others are prefaced with SELECTION to make clear which are for applying metrics. METRICs tend to be summary values that contain a signal number per component. Note there are some variables that are calculated in one section of the code that are later transformed into another metric that is actually part of a selection criterion. This running list is an attempt to summarize intermediate metrics vs the metrics that are actually used in decision steps. For applied metrics that are made up of intermediate metrics defined in earlier sections of the code, the constituent metrics are noted. More metrics will be added to the applied metrics section as the commenting of this function continues. Intermediate Metrics: seldict['F_S0_clmaps'] seldict['F_R2_clmaps'] seldict['Br_clmaps_S0'] seldict['Br_clmaps_R2'] seldict['Z_maps'] dice_tbl countnoise counts_FR2_Z tt_table mmix_kurt mmix_std spr fproj_arr_val fdist Rtz, Dz Applied Metrics: seldict['Rhos'] seldict['Kappas'] seldict['varex'] countsigFS0 countsigFR2 fz (a combination of multiple z-scored metrics: tt_table, seldict['varex'], seldict['Kappa'], seldict['Rho'], countnoise, mmix_kurt, fdist) tt_table[:,0] spz (z score of spr) KRcut """ """ If seldict exists, save it into a pickle file called compseldata.pklbz that can be loaded directly into python for future analyses If seldict=None, load it from the pre-saved pickle file to use for the rest of this function """ if filecsdata: import bz2 if seldict is not None: LGR.info('Saving component selection data') with bz2.BZ2File('compseldata.pklbz', 'wb') as csstate_f: pickle.dump(seldict, csstate_f) else: try: with bz2.BZ2File('compseldata.pklbz', 'rb') as csstate_f: seldict = pickle.load(csstate_f) except FileNotFoundError: LGR.warning('Failed to load component selection data') return None """ List of components all_comps and acc_comps start out as an ordered list of the component numbers all_comps is constant throughout the function. acc_comps changes through his function as components are assigned to other categories (i.e. 
components that are classified as rejected are removed from acc_comps) """ midk = [] ign = [] all_comps = np.arange(len(seldict['Kappas'])) acc_comps = np.arange(len(seldict['Kappas'])) """ If user has specified components to accept manually, just assign those components to the accepted and rejected comp lists and end the function """ if manacc: acc = sorted([int(vv) for vv in manacc.split(',')]) midk = [] rej = sorted(np.setdiff1d(all_comps, acc)) ign = [] return acc, rej, midk, ign # Add string for ign """ METRICS: countsigFS0 countsigFR2 F_S0_clmaps & F_R2_clmaps are the thresholded & binarized clustered maps of significant fits for the separate S0 and R2 cross-echo models per component. Since the values are 0 or 1, the countsig variables are a count of the significant voxels per component. The cluster size is a function of the # of voxels in the mask. The cluster threshold is based on the # of echos acquired """ countsigFS0 = seldict['F_S0_clmaps'].sum(0) countsigFR2 = seldict['F_R2_clmaps'].sum(0) countnoise = np.zeros(len(all_comps)) """ Make table of dice values METRICS: dice_tbl dice_FR2, dice_FS0 are calculated for each component and the concatenated values are in dice_tbl Br_clmaps_R2 and Br_clmaps_S0 are binarized clustered Z_maps. The volume being clustered is the rank order indices of the absolute value of the beta values for the fit between the optimally combined time series and the mixing matrix (i.e. the lowest beta value is 1 and the highest is the # of voxels). The cluster size is a function of the # of voxels in the mask. The cluster threshold are the voxels with beta ranks greater than countsigFS0 or countsigFR2 (i.e. roughly the same number of voxels will be in the countsig clusters as the ICA beta map clusters) These dice values are the Dice-Sorenson index for the Br_clmap_?? and the F_??_clmap. If handwerkerd understands this correctly, if the voxels with the above threshold F stats are clustered in the same voxels with the highest beta values, then the dice coefficient will be 1. If the thresholded F or betas aren't spatially clustered (i.e. the component map is less spatially smooth) or the clusters are in different locations (i.e. voxels with high betas are also noiser so they have lower F values), then the dice coefficients will be lower """ dice_tbl = np.zeros([all_comps.shape[0], 2]) for comp_num in all_comps: dice_FR2 = utils.dice(utils.unmask(seldict['Br_clmaps_R2'][:, comp_num], mask)[t2s != 0], seldict['F_R2_clmaps'][:, comp_num]) dice_FS0 = utils.dice(utils.unmask(seldict['Br_clmaps_S0'][:, comp_num], mask)[t2s != 0], seldict['F_S0_clmaps'][:, comp_num]) dice_tbl[comp_num, :] = [dice_FR2, dice_FS0] # step 3a here and above dice_tbl[np.isnan(dice_tbl)] = 0 """ Make table of noise gain METRICS: countnoise, counts_FR2_Z, tt_table (This is a bit confusing & is handwerkerd's attempt at making sense of this) seldict['Z_maps'] is the Fisher Z normalized beta fits for the optimally combined time series and the mixing matrix. Z_clmaps is a binarized cluster of Z_maps with the cluster size based on the # of voxels and the cluster threshold of 1.95. utils.andb is a sum of the True values in arrays so comp_noise_sel is true for voxels where the Z values are greater than 1.95 but not part of a cluster of Z values that are greater than 1.95. Spatially unclustered voxels with high Z values could be considerd noisy. countnoise is the # of voxels per component where comp_noise_sel is true. 
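    A small worked example of the utils.andb pattern used here, with made-up
    values: for Z = [2.1, 0.3, 2.5] and clmap = [0, 0, 1],
    utils.andb([np.abs(Z) > 1.95, clmap == 0]) sums the two boolean arrays to
    [2, 1, 1], so the `== 2` test marks only the first voxel, i.e. the one
    that is above threshold but not inside any cluster.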
    counts_FR2_Z is the number of voxels with Z values above the threshold
    that are in clusters (signal) and the number outside of clusters (noise).
    tt_table is a bit confusing. For each component, the first index is some
    type of normalized, log10, signal/noise t statistic and the second is the
    p value for the signal/noise t statistic (for the R2 model). In general,
    these should be bigger t or have lower p values when most of the Z values
    above threshold are inside clusters. Because of the log10, values below 1
    are negative, which is later used as a threshold. It doesn't seem like
    the p values are ever used.
    """
    tt_table = np.zeros([len(all_comps), 4])
    counts_FR2_Z = np.zeros([len(all_comps), 2])
    for comp_num in all_comps:
        comp_noise_sel = utils.andb([np.abs(seldict['Z_maps'][:, comp_num]) > 1.95,
                                     seldict['Z_clmaps'][:, comp_num] == 0]) == 2
        countnoise[comp_num] = np.array(comp_noise_sel, dtype=np.int).sum()
        noise_FR2_Z_mask = utils.unmask(comp_noise_sel, mask)[t2s != 0]
        noise_FR2_Z = np.log10(np.unique(seldict['F_R2_maps'][noise_FR2_Z_mask, comp_num]))
        signal_FR2_Z_mask = utils.unmask(seldict['Z_clmaps'][:, comp_num], mask)[t2s != 0] == 1
        signal_FR2_Z = np.log10(np.unique(seldict['F_R2_maps'][signal_FR2_Z_mask, comp_num]))
        counts_FR2_Z[comp_num, :] = [len(signal_FR2_Z), len(noise_FR2_Z)]
        ttest = stats.ttest_ind(signal_FR2_Z, noise_FR2_Z, equal_var=True)
        # avoid DivideByZero RuntimeWarning
        if signal_FR2_Z.size > 0 and noise_FR2_Z.size > 0:
            mwu = stats.norm.ppf(stats.mannwhitneyu(signal_FR2_Z, noise_FR2_Z)[1])
        else:
            mwu = -np.inf
        tt_table[comp_num, 0] = np.abs(mwu) * ttest[0] / np.abs(ttest[0])
        tt_table[comp_num, 1] = ttest[1]
    tt_table[np.isnan(tt_table)] = 0
    tt_table[np.isinf(tt_table[:, 0]), 0] = np.percentile(tt_table[~np.isinf(tt_table[:, 0]), 0],
                                                          98)

    """
    Time series derivative kurtosis
    METRICS: mmix_kurt and mmix_std
    Take the derivative of the time series for each component in the ICA
    mixing matrix and calculate the kurtosis & standard deviation.
    handwerkerd thinks these metrics are later used to calculate measures of
    time series spikiness and drift in the component time series.
    """
    mmix_dt = (mmix[:-1, :] - mmix[1:, :])
    mmix_kurt = stats.kurtosis(mmix_dt)
    mmix_std = np.std(mmix_dt, axis=0)

    """
    SELECTION #1 (prantikk labeled "Step 1")
    Reject anything that is obviously an artifact.
    Obvious artifacts are components with Rho>Kappa or with more clustered,
    significant voxels for the S0 model than the R2 model.
    """
    LGR.debug('Rejecting gross artifacts based on Rho/Kappa values and S0/R2 '
              'counts')
    rej = acc_comps[utils.andb([seldict['Rhos'] > seldict['Kappas'],
                                countsigFS0 > countsigFR2]) > 0]
    acc_comps = np.setdiff1d(acc_comps, rej)

    """
    prantikk labeled "Step 2"
    Compute 3-D spatial FFT of Beta maps to detect high-spatial frequency
    artifacts
    METRICS: spr, fproj_arr_val, fdist
    PSC is the mean centered beta map for each ICA component.
    The FFT is sequentially calculated across each dimension of PSC & the max
    value is removed (probably the 0Hz bin). The maximum remaining frequency
    magnitude along the z dimension is calculated, leaving a 2D matrix.
    spr contains a count of the number of frequency bins in the 2D matrix
    where the frequency magnitude is greater than 1/4 of the maximum freq in
    the matrix. spr is later z-normed across components into spz and this is
    actually used as a selection metric.
    handwerkerd interpretation: spr is bigger the more values of the fft are
    >1/4 the max. Thus, if you assume the highest mag bins are low frequency,
    & all components have roughly the same low freq power (i.e. a
    brain-shaped blob), then spr will be bigger the more high frequency bins
    have magnitudes that are more than 1/4 of the low frequency bins.
    fproj_arr_val is a flattened 1D vector of the 2D max projection fft of
    each component. This seems to be later used in an SVM to train on this
    value for rejected components to classify some remaining components as
    midk.
    Note: fproj_arr is created here and is a ranked list of FFT values, but
    is not used anywhere in the code. Was fproj_arr_val supposed to contain
    this ranking?
    fdist isn't completely clear to handwerkerd yet but it looks like the fit
    of the fft of the spatial map to a Gaussian distribution. If so, then the
    worse the fit, the more high frequency power would be in the component.
    """
    LGR.debug('Computing 3D spatial FFT of beta maps to detect high-spatial '
              'frequency artifacts')
    # spatial information is important so for NIFTI we convert back to 3D
    # space
    if utils.get_dtype(ref_img) == 'NIFTI':
        dim1 = np.prod(check_niimg(ref_img).shape[:2])
    else:
        dim1 = mask.shape[0]
    fproj_arr = np.zeros([dim1, len(all_comps)])
    fproj_arr_val = np.zeros([dim1, len(all_comps)])
    spr = []
    fdist = []
    for comp_num in all_comps:
        # convert data back to 3D array
        if utils.get_dtype(ref_img) == 'NIFTI':
            tproj = utils.new_nii_like(ref_img,
                                       utils.unmask(seldict['PSC'], mask)[:, comp_num]).get_data()
        else:
            tproj = utils.unmask(seldict['PSC'], mask)[:, comp_num]
        fproj = np.fft.fftshift(np.abs(np.fft.rfftn(tproj)))
        fproj_z = fproj.max(axis=-1)
        fproj[fproj == fproj.max()] = 0
        spr.append(np.array(fproj_z > fproj_z.max() / 4, dtype=np.int).sum())
        fproj_arr[:, comp_num] = stats.rankdata(fproj_z.flatten())
        fproj_arr_val[:, comp_num] = fproj_z.flatten()
        if utils.get_dtype(ref_img) == 'NIFTI':
            fprojr = np.array([fproj, fproj[:, :, ::-1]]).max(0)
            fdist.append(np.max([utils.fitgaussian(fproj.max(jj))[3:].max()
                                 for jj in range(fprojr.ndim)]))
        else:
            fdist = np.load(os.path.join(RESOURCES, 'fdist.npy'))
    if type(fdist) is not np.ndarray:
        fdist = np.array(fdist)
    spr = np.array(spr)
    # import ipdb; ipdb.set_trace()

    """
    prantikk labeled Step 3
    Create feature space of component properties
    METRICS: fz, spz, Rtz, Dz
    fz is a matrix of multiple other metrics described above and calculated
    in this section. All of these have one number per component and they are
    z-scored across components.
    Attempted explanations in order:
    Tz: The z-scored t statistics of the spatial noisiness metric in tt_table
    Vz: The z-scored natural log of the non-normalized variance explained of
        each component
    Ktz: The z-scored natural log of the Kappa values (the '/ 2' does not
        seem necessary because it will be removed by z-scoring)
    KRr: The z-scored ratio of the natural log of Kappa / natural log of Rho
        (unclear why sometimes using stats.zscore and other times writing the
        eq out)
    cnz: The z-scored measure of a noisy voxel count where the noisy voxels
        are the voxels with large beta estimates, but aren't part of clusters
    Rz: z-scored rho values (why aren't these log scaled, like kappa in Ktz?)
    mmix_kurt: Probably a rough measure of the spikiness of each component's
        time series in the ICA mixing matrix
    fdist_z: z-score of fdist, which is probably a measure of high freq info
        in the spatial FFT of the components (with lower being more high
        freq?)
    NOT in fz:
    spz: Z-scored measure probably of how much high freq is in the data.
        Larger values mean more bins of the FFT have over 1/4 the power of
        the maximum bin (read about spr above for more info)
    Rtz: Z-scored natural log of the Rho values
    Dz: Z-scored Fisher Z transformed dice values of the overlap between
        clusters for the F stats and clusters of the ICA spatial beta maps
        with roughly the same number of voxels as in the clustered F maps.
        Dz saves this for the R2 model; there are also Dice coefs for the S0
        model in dice_tbl.
    """
    LGR.debug('Creating feature space of component properties')
    fdist_pre = fdist.copy()
    fdist_pre[fdist > np.median(fdist) * 3] = np.median(fdist) * 3
    fdist_z = (fdist_pre - np.median(fdist_pre)) / fdist_pre.std()  # not z
    spz = stats.zscore(spr)
    Tz = stats.zscore(tt_table[:, 0])
    varex_log = np.log(seldict['varex'])
    Vz = stats.zscore(varex_log)
    Rz = stats.zscore(seldict['Rhos'])
    Ktz = stats.zscore(np.log(seldict['Kappas']) / 2)
    # Rtz = stats.zscore(np.log(seldict['Rhos']) / 2)
    KRr = stats.zscore(np.log(seldict['Kappas']) / np.log(seldict['Rhos']))
    cnz = stats.zscore(countnoise)
    Dz = stats.zscore(np.arctanh(dice_tbl[:, 0] + 0.001))
    fz = np.array([Tz, Vz, Ktz, KRr, cnz, Rz, mmix_kurt, fdist_z])

    """
    METRICS: Kcut, Rcut, KRcut, KRcutguesses, Khighelbowval
    Step 3: Make initial guess of where BOLD components are and use DBSCAN to
    exclude noise components and find a sample set of 'good' components
    """
    LGR.debug('Making initial guess of BOLD components')
    # The F threshold for the echo fit (based on the # of echos) for p<0.05,
    # p<0.025, and p<0.001 (Confirm this is accurate since the function
    # contains a lookup table rather than a calculation)
    F05, F025, F01 = utils.getfbounds(n_echos)
    # epsmap is [index, level of overlap with dicemask,
    #            number of high Rho components]
    epsmap = []
    Rhos_sorted = np.array(sorted(seldict['Rhos']))[::-1]

    """
    Make an initial guess as to number of good components based on consensus
    of control points across Rhos and Kappas.
    For terminology later, typically getelbow _aggr > _mod > _cons, though
    this might not be universally true. A more "inclusive" threshold has a
    lower kappa since that means more components are above that thresh and
    are likely to be accepted. For Rho, a more "inclusive" threshold is
    higher since that means fewer components will be rejected based on rho.
    KRcut seems weird to handwerkerd. I see that the thresholds are slightly
    shifted for kappa & rho later in the code, but why would we ever want to
    set a common threshold reference point for both? These are two different
    elbows on two different data sets.
    """
    KRcutguesses = [getelbow_mod(seldict['Rhos']),
                    getelbow_cons(seldict['Rhos']),
                    getelbow_aggr(seldict['Rhos']),
                    getelbow_mod(seldict['Kappas']),
                    getelbow_cons(seldict['Kappas']),
                    getelbow_aggr(seldict['Kappas'])]
    KRcut = np.median(KRcutguesses)

    """
    Also a bit weird to handwerkerd. This is the 75th percentile of the Kappa
    F stats of the components with the 3 elbow selection criteria and the F
    stats for 3 significance thresholds based on the # of echos. This is some
    type of way to get a significance criterion for a component fit, but it's
    unclear why this specific criterion is useful.
    """
    Khighelbowval = stats.scoreatpercentile([getelbow_mod(seldict['Kappas'], return_val=True),
                                             getelbow_cons(seldict['Kappas'], return_val=True),
                                             getelbow_aggr(seldict['Kappas'], return_val=True)] +
                                            list(utils.getfbounds(n_echos)),
                                            75, interpolation_method='lower')

    """
    Default to the most inclusive kappa threshold (_cons) unless:
    1. That threshold is more than twice the median of Kappa & Rho thresholds
    2. and the moderate elbow is more inclusive than the p=0.01 threshold
    handwerkerd: This actually seems like a way to avoid using the
    theoretically most liberal threshold only when there was a bad estimate
    and _mod is more inclusive. My one concern is that it's an odd way to
    test that the _mod elbow is any better. Why not at least see if
    _mod < _cons?
    prantikk's orig comment for this section is: "only use exclusive when
    inclusive is extremely inclusive - double KRcut"
    """
    cond1 = getelbow_cons(seldict['Kappas']) > KRcut * 2
    cond2 = getelbow_mod(seldict['Kappas'], return_val=True) < F01
    if cond1 and cond2:
        Kcut = getelbow_mod(seldict['Kappas'], return_val=True)
    else:
        Kcut = getelbow_cons(seldict['Kappas'], return_val=True)

    """
    handwerkerd: The goal seems to be to maximize the rejected components
    based on the rho cut by defaulting to a lower Rcut value. Again, if that
    is the goal, why not just test if _mod < _cons?
    prantikk's orig comment for this section is: only use inclusive when
    exclusive is extremely exclusive - half KRcut (remember for Rho inclusive
    is higher, so want both Kappa and Rho to default to lower)
    """
    if getelbow_cons(seldict['Rhos']) > KRcut * 2:
        Rcut = getelbow_mod(seldict['Rhos'], return_val=True)
        # for above, consider something like:
        # min([getelbow_mod(Rhos,True),sorted(Rhos)[::-1][KRguess] ])
    else:
        Rcut = getelbow_cons(seldict['Rhos'], return_val=True)
    # Rcut should never be higher than Kcut (handwerkerd: not sure why)
    if Rcut > Kcut:
        Kcut = Rcut
    # KRelbow has a 2 for components that are above the Kappa accept
    # threshold and below the rho reject threshold
    KRelbow = utils.andb([seldict['Kappas'] > Kcut,
                          seldict['Rhos'] < Rcut])

    """
    Make guess of Kundu et al 2011 plus remove high frequencies, generally
    high variance, and high variance given low Kappa.
    The first index of tt_table is a t statistic of what handwerkerd thinks
    is a spatial noise metric. Since the log10 of these values is taken, the
    >0 threshold means the metric is >1. tt_lim seems to be a fairly
    aggressive percentile that is then divided by 3.
    """
    tt_lim = stats.scoreatpercentile(tt_table[tt_table[:, 0] > 0, 0],
                                     75, interpolation_method='lower') / 3

    """
    KRguess is a list of components to potentially accept. It starts with a
    list of components that cross the Kcut and Rcut threshold and weren't
    previously rejected for other reasons. From that list, it removes more
    components based on several additional criteria:
    1. tt_table less than the tt_lim threshold (spatial noisiness metric)
    2. spz (a z-scored probably high spatial freq metric) > 1
    3. Vz (a z-scored variance explained metric) > 2
    4. If both (seems to be: if a component has a relatively high variance,
       the acceptance threshold for Kappa values is doubled):
       A. The variance explained is greater than half the KRcut highest
          variance component
       B. Kappa is less than twice Kcut
    """
    temp = all_comps[utils.andb([seldict['varex'] > 0.5 *
                                 sorted(seldict['varex'])[::-1][int(KRcut)],
                                 seldict['Kappas'] < 2*Kcut]) == 2]
    KRguess = np.setdiff1d(np.setdiff1d(all_comps[KRelbow == 2], rej),
                           np.union1d(all_comps[tt_table[:, 0] < tt_lim],
                                      np.union1d(np.union1d(all_comps[spz > 1],
                                                            all_comps[Vz > 2]),
                                                 temp)))
    guessmask = np.zeros(len(all_comps))
    guessmask[KRguess] = 1

    """
    Throw lower-risk bad components out based on 3 criteria all being true:
    1. tt_table (a spatial noisiness metric) < 0
    2. A component's variance explained is greater than the median variance
       explained
    3. The component index is greater than the KRcut index.
Since the components are sorted by kappa, this is another kappa thresholding) """ rejB = acc_comps[utils.andb([tt_table[acc_comps, 0] < 0, seldict['varex'][acc_comps] > np.median(seldict['varex']), acc_comps > KRcut]) == 3] rej = np.union1d(rej, rejB) # adjust acc_comps again to only contain the remaining non-rejected components acc_comps = np.setdiff1d(acc_comps, rej) """ This is where handwerkerd has paused in hypercommenting the function. """ LGR.debug('Using DBSCAN to find optimal set of "good" BOLD components') for ii in range(20000): eps = .005 + ii * .005 db = DBSCAN(eps=eps, min_samples=3).fit(fz.T) # it would be great to have descriptive names, here # DBSCAN found at least three non-noisy clusters cond1 = db.labels_.max() > 1 # DBSCAN didn't detect more classes than the total # of components / 6 cond2 = db.labels_.max() < len(all_comps) / 6 # TODO: confirm if 0 is a special label for DBSCAN # my intuition here is that we're confirming DBSCAN labelled previously # rejected components as noise (i.e., no overlap between `rej` and # labelled DBSCAN components) cond3 = np.intersect1d(rej, all_comps[db.labels_ == 0]).shape[0] == 0 # DBSCAN labelled less than half of the total components as noisy cond4 = np.array(db.labels_ == -1, dtype=int).sum() / float(len(all_comps)) < .5 if cond1 and cond2 and cond3 and cond4: epsmap.append([ii, utils.dice(guessmask, db.labels_ == 0), np.intersect1d(all_comps[db.labels_ == 0], all_comps[seldict['Rhos'] > getelbow_mod(Rhos_sorted, return_val=True)]).shape[0]]) db = None epsmap = np.array(epsmap) LGR.debug('Found DBSCAN solutions for {}/20000 eps resolutions'.format(len(epsmap))) group0 = [] dbscanfailed = False if len(epsmap) != 0: # Select index that maximizes Dice with guessmask but first # minimizes number of higher Rho components ii = int(epsmap[np.argmax(epsmap[epsmap[:, 2] == np.min(epsmap[:, 2]), 1], 0), 0]) LGR.debug('Component selection tuning: {:.05f}'.format(epsmap[:, 1].max())) db = DBSCAN(eps=.005+ii*.005, min_samples=3).fit(fz.T) acc_comps = all_comps[db.labels_ == 0] acc_comps = np.setdiff1d(acc_comps, rej) acc_comps = np.setdiff1d(acc_comps, acc_comps[acc_comps > len(all_comps) - len(rej)]) group0 = acc_comps.copy() group_n1 = all_comps[db.labels_ == -1] to_clf = np.setdiff1d(all_comps, np.union1d(acc_comps, rej)) if len(group0) == 0 or len(group0) < len(KRguess) * .5: dbscanfailed = True LGR.debug('DBSCAN guess failed; using elbow guess method instead') temp = all_comps[utils.andb([seldict['varex'] > 0.5 * sorted(seldict['varex'])[::-1][int(KRcut)], seldict['Kappas'] < 2 * Kcut]) == 2] acc_comps = np.setdiff1d(np.setdiff1d(all_comps[KRelbow == 2], rej), np.union1d(all_comps[tt_table[:, 0] < tt_lim], np.union1d(np.union1d(all_comps[spz > 1], all_comps[Vz > 2]), temp))) group0 = acc_comps.copy() group_n1 = [] to_clf = np.setdiff1d(all_comps, np.union1d(group0, rej)) if len(group0) < 2 or (len(group0) < 4 and float(len(rej))/len(group0) > 3): LGR.warning('Extremely limited reliable BOLD signal space! 
' 'Not filtering components beyond BOLD/non-BOLD guesses.') midkfailed = True min_acc = np.array([]) if len(group0) != 0: # For extremes, building in a 20% tolerance toacc_hi = np.setdiff1d(all_comps[utils.andb([fdist <= np.max(fdist[group0]), seldict['Rhos'] < F025, Vz > -2]) == 3], np.union1d(group0, rej)) min_acc = np.union1d(group0, toacc_hi) to_clf = np.setdiff1d(all_comps, np.union1d(min_acc, rej)) else: toacc_hi = [] min_acc = [] diagstep_keys = ['Rejected components', 'Kappa-Rho cut point', 'Kappa cut point', 'Rho cut point', 'DBSCAN failed to converge', 'Mid-Kappa failed (limited BOLD signal)', 'Kappa-Rho guess', 'min_acc', 'toacc_hi'] diagstep_vals = [list(rej), KRcut, Kcut, Rcut, dbscanfailed, midkfailed, list(KRguess), list(min_acc), list(toacc_hi)] with open('csstepdata.json', 'w') as ofh: json.dump(dict(zip(diagstep_keys, diagstep_vals)), ofh, indent=4, sort_keys=True, default=str) return list(sorted(min_acc)), list(sorted(rej)), [], list(sorted(to_clf)) # Find additional components to reject based on Dice - doing this here # since Dice is a little unstable, need to reference group0 rej_supp = [] dice_rej = False if not dbscanfailed and len(rej) + len(group0) < 0.75 * len(all_comps): dice_rej = True temp = all_comps[dice_tbl[all_comps, 0] <= dice_tbl[all_comps, 1]] rej_supp = np.setdiff1d(np.setdiff1d(np.union1d(rej, temp), group0), group_n1) rej = np.union1d(rej, rej_supp) # Temporal features # larger is worse - spike mmix_kurt_z = (mmix_kurt-mmix_kurt[group0].mean()) / mmix_kurt[group0].std() # smaller is worse - drift mmix_std_z = -1 * ((mmix_std-mmix_std[group0].mean()) / mmix_std[group0].std()) mmix_kurt_z_max = np.max([mmix_kurt_z, mmix_std_z], 0) """ Step 2: Classifiy midk and ignore using separate SVMs for different variance regimes # To render hyperplane: min_x = np.min(spz2);max_x=np.max(spz2) # plotting separating hyperplane ww = clf_.coef_[0] aa = -ww[0] / ww[1] # make sure the next line is long enough xx = np.linspace(min_x - 2, max_x + 2) yy = aa * xx - (clf_.intercept_[0]) / ww[1] plt.plot(xx, yy, '-') """ LGR.debug('Attempting to classify midk components') # Tried getting rid of accepting based on SVM altogether, # now using only rejecting toacc_hi = np.setdiff1d(all_comps[utils.andb([fdist <= np.max(fdist[group0]), seldict['Rhos'] < F025, Vz > -2]) == 3], np.union1d(group0, rej)) temp = utils.andb([spz < 1, Rz < 0, mmix_kurt_z_max < 5, Dz > -1, Tz > -1, Vz < 0, seldict['Kappas'] >= F025, fdist < 3 * np.percentile(fdist[group0], 98)]) == 8 toacc_lo = np.intersect1d(to_clf, all_comps[temp]) midk_clf, clf_ = do_svm(fproj_arr_val[:, np.union1d(group0, rej)].T, [0] * len(group0) + [1] * len(rej), fproj_arr_val[:, to_clf].T, svmtype=2) midk = np.setdiff1d(to_clf[utils.andb([midk_clf == 1, seldict['varex'][to_clf] > np.median(seldict['varex'][group0])]) == 2], np.union1d(toacc_hi, toacc_lo)) # only use SVM to augment toacc_hi only if toacc_hi isn't already # conflicting with SVM choice if len(np.intersect1d(to_clf[utils.andb([midk_clf == 1, Vz[to_clf] > 0]) == 2], toacc_hi)) == 0: svm_acc_fail = True toacc_hi = np.union1d(toacc_hi, to_clf[midk_clf == 0]) else: svm_acc_fail = False """ Step 3: Compute variance associated with low T2* areas (e.g. 
draining veins and low T2* areas) # To write out veinmask veinout = np.zeros(t2s.shape) veinout[t2s!=0] = veinmaskf utils.filewrite(veinout, 'veinmaskf', ref_img) veinBout = utils.unmask(veinmaskB, mask) utils.filewrite(veinBout, 'veins50', ref_img) """ LGR.debug('Computing variance associated with low T2* areas (e.g., ' 'draining veins)') tsoc_B_Zcl = np.zeros(seldict['tsoc_B'].shape) tsoc_B_Zcl[seldict['Z_clmaps'] != 0] = np.abs(seldict['tsoc_B'])[seldict['Z_clmaps'] != 0] sig_B = [stats.scoreatpercentile(tsoc_B_Zcl[tsoc_B_Zcl[:, ii] != 0, ii], 25) if len(tsoc_B_Zcl[tsoc_B_Zcl[:, ii] != 0, ii]) != 0 else 0 for ii in all_comps] sig_B = np.abs(seldict['tsoc_B']) > np.tile(sig_B, [seldict['tsoc_B'].shape[0], 1]) veinmask = utils.andb([t2s < stats.scoreatpercentile(t2s[t2s != 0], 15, interpolation_method='lower'), t2s != 0]) == 2 veinmaskf = veinmask[mask] veinR = np.array(sig_B[veinmaskf].sum(0), dtype=float) / sig_B[~veinmaskf].sum(0) veinR[np.isnan(veinR)] = 0 veinc = np.union1d(rej, midk) rej_veinRZ = ((veinR-veinR[veinc].mean())/veinR[veinc].std())[veinc] rej_veinRZ[rej_veinRZ < 0] = 0 rej_veinRZ[countsigFR2[veinc] > np.array(veinmaskf, dtype=int).sum()] = 0 t2s_lim = [stats.scoreatpercentile(t2s[t2s != 0], 50, interpolation_method='lower'), stats.scoreatpercentile(t2s[t2s != 0], 80, interpolation_method='lower') / 2] phys_var_zs = [] for t2sl_i in range(len(t2s_lim)): t2sl = t2s_lim[t2sl_i] veinW = sig_B[:, veinc]*np.tile(rej_veinRZ, [sig_B.shape[0], 1]) veincand = utils.unmask(utils.andb([s0[t2s != 0] < np.median(s0[t2s != 0]), t2s[t2s != 0] < t2sl]) >= 1, t2s != 0)[mask] veinW[~veincand] = 0 invein = veinW.sum(axis=1)[(utils.unmask(veinmaskf, mask) * utils.unmask(veinW.sum(axis=1) > 1, mask))[mask]] minW = 10 * (np.log10(invein).mean()) - 1 * 10**(np.log10(invein).std()) veinmaskB = veinW.sum(axis=1) > minW tsoc_Bp = seldict['tsoc_B'].copy() tsoc_Bp[tsoc_Bp < 0] = 0 vvex = np.array([(tsoc_Bp[veinmaskB, ii]**2.).sum() / (tsoc_Bp[:, ii]**2.).sum() for ii in all_comps]) group0_res = np.intersect1d(KRguess, group0) phys_var_zs.append((vvex - vvex[group0_res].mean()) / vvex[group0_res].std()) veinBout = utils.unmask(veinmaskB, mask) utils.filewrite(veinBout.astype(float), 'veins_l%i' % t2sl_i, ref_img) # Mask to sample veins phys_var_z = np.array(phys_var_zs).max(0) Vz2 = (varex_log - varex_log[group0].mean())/varex_log[group0].std() """ Step 4: Learn joint TE-dependence spatial and temporal models to move remaining artifacts to ignore class """ LGR.debug('Learning joint TE-dependence spatial/temporal models to ignore remaining artifacts') to_ign = [] minK_ign = np.max([F05, getelbow_cons(seldict['Kappas'], return_val=True)]) newcest = len(group0) + len(toacc_hi[seldict['Kappas'][toacc_hi] > minK_ign]) phys_art = np.setdiff1d(all_comps[utils.andb([phys_var_z > 3.5, seldict['Kappas'] < minK_ign]) == 2], group0) rank_diff = stats.rankdata(phys_var_z) - stats.rankdata(seldict['Kappas']) phys_art = np.union1d(np.setdiff1d(all_comps[utils.andb([phys_var_z > 2, rank_diff > newcest / 2, Vz2 > -1]) == 3], group0), phys_art) # Want to replace field_art with an acf/SVM based approach # instead of a kurtosis/filter one field_art = np.setdiff1d(all_comps[utils.andb([mmix_kurt_z_max > 5, seldict['Kappas'] < minK_ign]) == 2], group0) temp = (stats.rankdata(mmix_kurt_z_max) - stats.rankdata(seldict['Kappas'])) > newcest / 2 field_art = np.union1d(np.setdiff1d(all_comps[utils.andb([mmix_kurt_z_max > 2, temp, Vz2 > 1, seldict['Kappas'] < F01]) == 4], group0), field_art) temp = seldict['Rhos'] > 
np.percentile(seldict['Rhos'][group0], 75) field_art = np.union1d(np.setdiff1d(all_comps[utils.andb([mmix_kurt_z_max > 3, Vz2 > 3, temp]) == 3], group0), field_art) field_art = np.union1d(np.setdiff1d(all_comps[utils.andb([mmix_kurt_z_max > 5, Vz2 > 5]) == 2], group0), field_art) misc_art = np.setdiff1d(all_comps[utils.andb([(stats.rankdata(Vz) - stats.rankdata(Ktz)) > newcest / 2, seldict['Kappas'] < Khighelbowval]) == 2], group0) ign_cand = np.unique(list(field_art)+list(phys_art)+list(misc_art)) midkrej = np.union1d(midk, rej) to_ign = np.setdiff1d(list(ign_cand), midkrej) toacc = np.union1d(toacc_hi, toacc_lo) acc_comps = np.setdiff1d(np.union1d(acc_comps, toacc), np.union1d(to_ign, midkrej)) ign = np.setdiff1d(all_comps, list(acc_comps) + list(midk) + list(rej)) orphan = np.setdiff1d(all_comps, list(acc_comps) + list(to_ign) + list(midk) + list(rej)) # Last ditch effort to save some transient components if not strict_mode: Vz3 = (varex_log - varex_log[acc_comps].mean()) / varex_log[acc_comps].std() temp = utils.andb([seldict['Kappas'] > F05, seldict['Rhos'] < F025, seldict['Kappas'] > seldict['Rhos'], Vz3 <= -1, Vz3 > -3, mmix_kurt_z_max < 2.5]) acc_comps = np.union1d(acc_comps, np.intersect1d(orphan, all_comps[temp == 6])) ign = np.setdiff1d(all_comps, list(acc_comps)+list(midk)+list(rej)) orphan = np.setdiff1d(all_comps, list(acc_comps) + list(to_ign) + list(midk) + list(rej)) if savecsdiag: diagstep_keys = ['Rejected components', 'Kappa-Rho cut point', 'Kappa cut', 'Rho cut', 'DBSCAN failed to converge', 'Kappa-Rho guess', 'Dice rejected', 'rej_supp', 'to_clf', 'Mid-kappa components', 'svm_acc_fail', 'toacc_hi', 'toacc_lo', 'Field artifacts', 'Physiological artifacts', 'Miscellaneous artifacts', 'acc_comps', 'Ignored components'] diagstep_vals = [list(rej), KRcut.item(), Kcut.item(), Rcut.item(), dbscanfailed, list(KRguess), dice_rej, list(rej_supp), list(to_clf), list(midk), svm_acc_fail, list(toacc_hi), list(toacc_lo), list(field_art), list(phys_art), list(misc_art), list(acc_comps), list(ign)] with open('csstepdata.json', 'w') as ofh: json.dump(dict(zip(diagstep_keys, diagstep_vals)), ofh, indent=4, sort_keys=True, default=str) allfz = np.array([Tz, Vz, Ktz, KRr, cnz, Rz, mmix_kurt, fdist_z]) np.savetxt('csdata.txt', allfz) return list(sorted(acc_comps)), list(sorted(rej)), list(sorted(midk)), list(sorted(ign))
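# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the pipeline): a minimal version of the
# DBSCAN eps sweep used above, run on a random feature matrix. The variable
# names and data here are made up; the real loop sweeps 20000 eps values over
# fz and then picks a solution by Dice overlap with the elbow-based guess.
# ---------------------------------------------------------------------------
def _demo_dbscan_eps_sweep():
    import numpy as np
    from sklearn.cluster import DBSCAN
    rng = np.random.RandomState(0)
    fz_demo = rng.randn(8, 30)            # 8 features x 30 "components"
    candidates = []
    for ii in range(100):
        eps = .005 + ii * .005
        db = DBSCAN(eps=eps, min_samples=3).fit(fz_demo.T)
        n_clusters = db.labels_.max() + 1
        frac_noise = (db.labels_ == -1).mean()
        # keep solutions with a few real clusters and < 50% noise points,
        # loosely mirroring cond1-cond4 above
        if n_clusters >= 2 and frac_noise < .5:
            candidates.append((eps, n_clusters, frac_noise))
    return candidates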
def selcomps(seldict, comptable, mmix, manacc, n_echos): """ Classify components in seldict as "accepted," "rejected," "midk," or "ignored." The selection process uses previously calculated parameters listed in `seldict` for each ICA component such as Kappa (a T2* weighting metric), Rho (an S0 weighting metric), and variance explained. See `Notes` for additional calculated metrics used to classify each component into one of the four listed groups. Parameters ---------- seldict : :obj:`dict` A dictionary with component-specific features used for classification. As output from `fitmodels_direct` comptable : (C x 5) :obj:`pandas.DataFrame` Component metric table mmix : (T x C) array_like Mixing matrix for converting input data to component space, where `C` is components and `T` is the number of volumes in the original data manacc : :obj:`list` Comma-separated list of indices of manually accepted components n_echos : :obj:`int` Number of echos in original data Returns ------- comptable : :obj:`pandas.DataFrame` Updated component table with additional metrics and with classification (accepted, rejected, midk, or ignored) Notes ----- The selection algorithm used in this function was originated in ME-ICA by Prantik Kundu, and his original implementation is available at: https://github.com/ME-ICA/me-ica/blob/b2781dd087ab9de99a2ec3925f04f02ce84f0adc/meica.libs/select_model.py This component selection process uses multiple, previously calculated metrics that include: kappa, rho, variance explained, component spatial weighting maps, noise and spatial frequency metrics, and measures of spatial overlap across metrics. Prantik began to update these selection criteria to use SVMs to distinguish components, a hypercommented version of this attempt is available at: https://gist.github.com/emdupre/ca92d52d345d08ee85e104093b81482e """ cols_at_end = ['classification', 'rationale'] comptable['classification'] = 'accepted' comptable['rationale'] = '' Z_maps = seldict['Z_maps'] Z_clmaps = seldict['Z_clmaps'] F_R2_maps = seldict['F_R2_maps'] F_S0_clmaps = seldict['F_S0_clmaps'] F_R2_clmaps = seldict['F_R2_clmaps'] Br_S0_clmaps = seldict['Br_S0_clmaps'] Br_R2_clmaps = seldict['Br_R2_clmaps'] n_vols, n_comps = mmix.shape # Set knobs LOW_PERC = 25 HIGH_PERC = 90 if n_vols < 100: EXTEND_FACTOR = 3 else: EXTEND_FACTOR = 2 RESTRICT_FACTOR = 2 # List of components midk = [] ign = [] all_comps = np.arange(comptable.shape[0]) acc = np.arange(comptable.shape[0]) # If user has specified if manacc: acc = sorted([int(vv) for vv in manacc.split(',')]) rej = sorted(np.setdiff1d(all_comps, acc)) comptable.loc[acc, 'classification'] = 'accepted' comptable.loc[rej, 'classification'] = 'rejected' comptable.loc[rej, 'rationale'] += 'I001;' # Move decision columns to end comptable = comptable[[c for c in comptable if c not in cols_at_end] + [c for c in cols_at_end if c in comptable]] return comptable """ Do some tallies for no. 
of significant voxels """ countnoise = np.zeros(n_comps) comptable['countsigFR2'] = F_R2_clmaps.sum(axis=0) comptable['countsigFS0'] = F_S0_clmaps.sum(axis=0) """ Make table of dice values """ comptable['dice_FR2'] = np.zeros(all_comps.shape[0]) comptable['dice_FS0'] = np.zeros(all_comps.shape[0]) for i_comp in acc: comptable.loc[i_comp, 'dice_FR2'] = utils.dice(Br_R2_clmaps[:, i_comp], F_R2_clmaps[:, i_comp]) comptable.loc[i_comp, 'dice_FS0'] = utils.dice(Br_S0_clmaps[:, i_comp], F_S0_clmaps[:, i_comp]) comptable.loc[np.isnan(comptable['dice_FR2']), 'dice_FR2'] = 0 comptable.loc[np.isnan(comptable['dice_FS0']), 'dice_FS0'] = 0 """ Make table of noise gain """ comptable['countnoise'] = 0 comptable['signal-noise_t'] = 0 comptable['signal-noise_p'] = 0 for i_comp in all_comps: comp_noise_sel = ((np.abs(Z_maps[:, i_comp]) > 1.95) & (Z_clmaps[:, i_comp] == 0)) comptable.loc[i_comp, 'countnoise'] = np.array(comp_noise_sel, dtype=np.int).sum() noise_FR2_Z = np.log10(np.unique(F_R2_maps[comp_noise_sel, i_comp])) signal_FR2_Z = np.log10( np.unique(F_R2_maps[Z_clmaps[:, i_comp] == 1, i_comp])) (comptable.loc[i_comp, 'signal-noise_t'], comptable.loc[i_comp, 'signal-noise_p']) = stats.ttest_ind(signal_FR2_Z, noise_FR2_Z, equal_var=False) comptable.loc[np.isnan(comptable['signal-noise_t']), 'signal-noise_t'] = 0 comptable.loc[np.isnan(comptable['signal-noise_p']), 'signal-noise_p'] = 0 """ Assemble decision table """ d_table_rank = np.vstack([ n_comps - stats.rankdata(comptable['kappa'], method='ordinal'), n_comps - stats.rankdata(comptable['dice_FR2'], method='ordinal'), n_comps - stats.rankdata(comptable['signal-noise_t'], method='ordinal'), stats.rankdata(countnoise, method='ordinal'), n_comps - stats.rankdata(comptable['countsigFR2'], method='ordinal') ]).T n_decision_metrics = d_table_rank.shape[1] comptable['d_table_score'] = d_table_rank.sum(axis=1) """ Step 1: Reject anything that's obviously an artifact a. Estimate a null variance """ temp_rej0 = all_comps[(comptable['rho'] > comptable['kappa']) | ( (comptable['countsigFS0'] > comptable['countsigFR2']) & (comptable['countsigFR2'] > 0))] comptable.loc[temp_rej0, 'classification'] = 'rejected' comptable.loc[temp_rej0, 'rationale'] += 'I002;' temp_rej1 = all_comps[(comptable['dice_FS0'] > comptable['dice_FR2']) & (comptable['variance explained'] > np.median( comptable['variance explained']))] comptable.loc[temp_rej1, 'classification'] = 'rejected' comptable.loc[temp_rej1, 'rationale'] += 'I003;' rej = np.union1d(temp_rej0, temp_rej1) temp_rej2 = acc[(comptable.loc[acc, 'signal-noise_t'] < 0) & (comptable.loc[acc, 'variance explained'] > np.median( comptable['variance explained']))] comptable.loc[temp_rej2, 'classification'] = 'rejected' comptable.loc[temp_rej2, 'rationale'] += 'I004;' rej = np.union1d(temp_rej2, rej) acc = np.setdiff1d(acc, rej) """ Step 2: Make a guess for what the good components are, in order to estimate good component properties a. Not outlier variance b. Kappa>kappa_elbow c. Rho<Rho_elbow d. High R2* dice compared to S0 dice e. Gain of F_R2 in clusters vs noise f. Estimate a low and high variance """ # Step 2a varex_upper_p = np.median(comptable.loc[ comptable['kappa'] > getelbow(comptable['kappa'], return_val=True), 'variance explained']) ncls = acc.copy() # NOTE: We're not sure why this is done, nor why it's specifically done # three times. Need to look into this deeper, esp. to make sure the 3 # isn't a hard-coded reference to the number of echoes. 
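    # One reading (unverified): each pass keeps components whose increase in
    # variance explained over the previous (kappa-ordered) component stays
    # below varex_upper_p; note that .diff() is NaN for the first row, so the
    # first component in ncls is dropped on every pass.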
for nn in range(3): ncls = comptable.loc[ncls].loc[comptable.loc[ ncls, 'variance explained'].diff() < varex_upper_p].index.values # Compute elbows kappas_lim = comptable.loc[ comptable['kappa'] < utils.getfbounds(n_echos)[-1], 'kappa'] kappa_elbow = np.min((getelbow(kappas_lim, return_val=True), getelbow(comptable['kappa'], return_val=True))) rho_elbow = np.mean( (getelbow(comptable.loc[ncls, 'rho'], return_val=True), getelbow(comptable['rho'], return_val=True), utils.getfbounds(n_echos)[0])) # Initial guess of good components based on Kappa and Rho elbows good_guess = ncls[(comptable.loc[ncls, 'kappa'] >= kappa_elbow) & (comptable.loc[ncls, 'rho'] < rho_elbow)] if len(good_guess) == 0: LGR.warning('No BOLD-like components detected') ign = sorted(np.setdiff1d(all_comps, rej)) comptable.loc[ign, 'classification'] = 'ignored' comptable.loc[ign, 'rationale'] += 'I005;' # Move decision columns to end comptable = comptable[[c for c in comptable if c not in cols_at_end] + [c for c in cols_at_end if c in comptable]] return comptable kappa_rate = ((np.max(comptable.loc[good_guess, 'kappa']) - np.min(comptable.loc[good_guess, 'kappa'])) / (np.max(comptable.loc[good_guess, 'variance explained']) - np.min(comptable.loc[good_guess, 'variance explained']))) kappa_ratios = kappa_rate * comptable['variance explained'] / comptable[ 'kappa'] varex_lower = stats.scoreatpercentile( comptable.loc[good_guess, 'variance explained'], LOW_PERC) varex_upper = stats.scoreatpercentile( comptable.loc[good_guess, 'variance explained'], HIGH_PERC) """ Step 3: Get rid of midk components; i.e., those with higher than max decision score and high variance """ max_good_d_score = EXTEND_FACTOR * len(good_guess) * n_decision_metrics midk = acc[(comptable.loc[acc, 'd_table_score'] > max_good_d_score) & (comptable.loc[acc, 'variance explained'] > EXTEND_FACTOR * varex_upper)] comptable.loc[midk, 'classification'] = 'rejected' comptable.loc[midk, 'rationale'] += 'I006;' acc = np.setdiff1d(acc, midk) """ Step 4: Find components to ignore """ good_guess = np.setdiff1d(good_guess, midk) loaded = np.union1d( good_guess, acc[comptable.loc[acc, 'variance explained'] > varex_lower]) ign = np.setdiff1d(acc, loaded) ign = np.setdiff1d( ign, ign[comptable.loc[ign, 'd_table_score'] < max_good_d_score]) ign = np.setdiff1d(ign, ign[comptable.loc[ign, 'kappa'] > kappa_elbow]) comptable.loc[ign, 'classification'] = 'ignored' comptable.loc[ign, 'rationale'] += 'I007;' acc = np.setdiff1d(acc, ign) """ Step 5: Scrub the set """ if len(acc) > len(good_guess): # Recompute the midk steps on the limited set to clean up the tail d_table_rank = np.vstack([ len(acc) - stats.rankdata(comptable.loc[acc, 'kappa'], method='ordinal'), len(acc) - stats.rankdata(comptable.loc[acc, 'dice_FR2'], method='ordinal'), len(acc) - stats.rankdata(comptable.loc[acc, 'signal-noise_t'], method='ordinal'), stats.rankdata(countnoise[acc], method='ordinal'), len(acc) - stats.rankdata(comptable.loc[acc, 'countsigFR2'], method='ordinal') ]).T comptable['d_table_score_scrub'] = np.nan comptable.loc[acc, 'd_table_score_scrub'] = d_table_rank.sum(1) num_acc_guess = int( np.mean([ np.sum((comptable.loc[acc, 'kappa'] > kappa_elbow) & (comptable.loc[acc, 'rho'] < rho_elbow)), np.sum(comptable.loc[acc, 'kappa'] > kappa_elbow) ])) conservative_guess = num_acc_guess * n_decision_metrics / RESTRICT_FACTOR # Rejection candidate based on artifact type A: candartA candartA = np.intersect1d( acc[comptable.loc[acc, 'd_table_score_scrub'] > conservative_guess], acc[kappa_ratios[acc] > 
EXTEND_FACTOR * 2]) candartA = np.intersect1d( candartA, candartA[comptable.loc[candartA, 'variance explained'] > varex_upper * EXTEND_FACTOR]) comptable.loc[candartA, 'classification'] = 'rejected' comptable.loc[candartA, 'rationale'] += 'I008;' midk = np.union1d(midk, candartA) # Rejection candidate based on artifact type B: candartB candartB = comptable.loc[acc].loc[ comptable.loc[acc, 'd_table_score_scrub'] > num_acc_guess * n_decision_metrics * HIGH_PERC / 100.].index.values candartB = np.intersect1d( candartB, candartB[comptable.loc[candartB, 'variance explained'] > varex_lower * EXTEND_FACTOR]) midk = np.union1d(midk, candartB) comptable.loc[candartB, 'classification'] = 'rejected' comptable.loc[candartB, 'rationale'] += 'I009;' # Find comps to ignore new_varex_lower = stats.scoreatpercentile( comptable.loc[acc[:num_acc_guess], 'variance explained'], LOW_PERC) candart = comptable.loc[acc].loc[ comptable.loc[acc, 'd_table_score'] > num_acc_guess * n_decision_metrics].index.values ign_add0 = np.intersect1d( candart[comptable.loc[candart, 'variance explained'] > new_varex_lower], candart) ign_add0 = np.setdiff1d(ign_add0, midk) comptable.loc[ign_add0, 'classification'] = 'ignored' comptable.loc[ign_add0, 'rationale'] += 'I010;' ign = np.union1d(ign, ign_add0) ign_add1 = np.intersect1d( acc[comptable.loc[acc, 'kappa'] <= kappa_elbow], acc[comptable.loc[acc, 'variance explained'] > new_varex_lower]) ign_add1 = np.setdiff1d(ign_add1, midk) comptable.loc[ign_add1, 'classification'] = 'ignored' comptable.loc[ign_add1, 'rationale'] += 'I011;' ign = np.union1d(ign, ign_add1) acc = np.setdiff1d(acc, np.union1d(midk, ign)) # Move decision columns to end comptable = comptable[[c for c in comptable if c not in cols_at_end] + [c for c in cols_at_end if c in comptable]] return comptable
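# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the pipeline): the d_table_score above
# sums ordinal ranks across metrics so that *lower* totals are more
# BOLD-like. A two-metric toy version with made-up kappa and countnoise
# values:
# ---------------------------------------------------------------------------
def _demo_decision_table_score():
    import numpy as np
    from scipy import stats
    kappa = np.array([80., 45., 120., 30.])     # toy values
    countnoise = np.array([10., 40., 5., 60.])  # toy values
    n_comps = len(kappa)
    d_table_rank = np.vstack([
        # high kappa is good, so invert the rank
        n_comps - stats.rankdata(kappa, method='ordinal'),
        # many unclustered noise voxels are bad, so rank directly
        stats.rankdata(countnoise, method='ordinal'),
    ]).T
    # component 2 (kappa=120, countnoise=5) gets the lowest (best) score
    return d_table_rank.sum(axis=1)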
def selcomps(seldict, mmix, mask, ref_img, manacc, n_echos, t2s, s0, olevel=2, oversion=99, filecsdata=True, savecsdiag=True, strict_mode=False): """ Labels components in `mmix` Parameters ---------- seldict : :obj:`dict` As output from `fitmodels_direct` mmix : (C x T) array_like Mixing matrix for converting input data to component space, where `C` is components and `T` is the number of volumes in the original data mask : (S,) array_like Boolean mask array ref_img : str or img_like Reference image to dictate how outputs are saved to disk manacc : list Comma-separated list of indices of manually accepted components n_echos : int Number of echos in original data t2s : (S,) array_like s0 : (S,) array_like olevel : int, optional Default: 2 oversion : int, optional Default: 99 filecsdata: bool, optional Default: False savecsdiag: bool, optional Default: True strict_mode: bool, optional Default: False Returns ------- acc : list Indices of accepted (BOLD) components in `mmix` rej : list Indices of rejected (non-BOLD) components in `mmix` midk : list Indices of mid-K (questionable) components in `mmix` ign : list Indices of ignored components in `mmix` """ if filecsdata: import bz2 if seldict is not None: LGR.info('Saving component selection data') with bz2.BZ2File('compseldata.pklbz', 'wb') as csstate_f: pickle.dump(seldict, csstate_f) else: try: with bz2.BZ2File('compseldata.pklbz', 'rb') as csstate_f: seldict = pickle.load(csstate_f) except FileNotFoundError: LGR.warning('Failed to load component selection data') return None # List of components midk = [] ign = [] nc = np.arange(len(seldict['Kappas'])) ncl = np.arange(len(seldict['Kappas'])) # If user has specified components to accept manually if manacc: acc = sorted([int(vv) for vv in manacc.split(',')]) midk = [] rej = sorted(np.setdiff1d(ncl, acc)) return acc, rej, midk, [] # Add string for ign """ Do some tallies for no. of significant voxels """ countsigFS0 = seldict['F_S0_clmaps'].sum(0) countsigFR2 = seldict['F_R2_clmaps'].sum(0) countnoise = np.zeros(len(nc)) """ Make table of dice values """ dice_tbl = np.zeros([nc.shape[0], 2]) for ii in ncl: dice_FR2 = utils.dice(utils.unmask(seldict['Br_clmaps_R2'][:, ii], mask)[t2s != 0], seldict['F_R2_clmaps'][:, ii]) dice_FS0 = utils.dice(utils.unmask(seldict['Br_clmaps_S0'][:, ii], mask)[t2s != 0], seldict['F_S0_clmaps'][:, ii]) dice_tbl[ii, :] = [dice_FR2, dice_FS0] # step 3a here and above dice_tbl[np.isnan(dice_tbl)] = 0 """ Make table of noise gain """ tt_table = np.zeros([len(nc), 4]) counts_FR2_Z = np.zeros([len(nc), 2]) for ii in nc: comp_noise_sel = utils.andb([np.abs(seldict['Z_maps'][:, ii]) > 1.95, seldict['Z_clmaps'][:, ii] == 0]) == 2 countnoise[ii] = np.array(comp_noise_sel, dtype=np.int).sum() noise_FR2_Z_mask = utils.unmask(comp_noise_sel, mask)[t2s != 0] noise_FR2_Z = np.log10(np.unique(seldict['F_R2_maps'][noise_FR2_Z_mask, ii])) signal_FR2_Z_mask = utils.unmask(seldict['Z_clmaps'][:, ii], mask)[t2s != 0] == 1 signal_FR2_Z = np.log10(np.unique(seldict['F_R2_maps'][signal_FR2_Z_mask, ii])) counts_FR2_Z[ii, :] = [len(signal_FR2_Z), len(noise_FR2_Z)] try: ttest = stats.ttest_ind(signal_FR2_Z, noise_FR2_Z, equal_var=True) # avoid DivideByZero RuntimeWarning if signal_FR2_Z.size > 0 and noise_FR2_Z.size > 0: mwu = stats.norm.ppf(stats.mannwhitneyu(signal_FR2_Z, noise_FR2_Z)[1]) else: mwu = -np.inf tt_table[ii, 0] = np.abs(mwu) * ttest[0] / np.abs(ttest[0]) tt_table[ii, 1] = ttest[1] except Exception: # TODO: what is the error that might be caught here? 
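            # Likely candidates (unverified): stats.mannwhitneyu raises
            # ValueError when, e.g., all values are identical, and the t-test
            # or indexing above can fail on degenerate inputs; the bare
            # except silently skips this component either way.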
    Estimate a null variance
    """
    LGR.debug('Rejecting gross artifacts based on Rho/Kappa values and S0/R2 counts')
    rej = ncl[utils.andb([seldict['Rhos'] > seldict['Kappas'],
                          countsigFS0 > countsigFR2]) > 0]
    ncl = np.setdiff1d(ncl, rej)

    """
    Step 2: Compute 3-D spatial FFT of Beta maps to detect high-spatial
    frequency artifacts
    """
    LGR.debug('Computing 3D spatial FFT of beta maps to detect high-spatial frequency artifacts')
    # spatial information is important so for NIFTI we convert back to 3D space
    if utils.get_dtype(ref_img) == 'NIFTI':
        dim1 = np.prod(ref_img.shape[:2])
    else:
        dim1 = mask.shape[0]
    fproj_arr = np.zeros([dim1, len(nc)])
    fproj_arr_val = np.zeros([dim1, len(nc)])
    spr = []
    fdist = []
    for ii in nc:
        # convert data back to 3D array
        if utils.get_dtype(ref_img) == 'NIFTI':
            tproj = utils.new_nii_like(ref_img,
                                       utils.unmask(seldict['PSC'], mask)[:, ii]).get_data()
        else:
            tproj = utils.unmask(seldict['PSC'], mask)[:, ii]
        fproj = np.fft.fftshift(np.abs(np.fft.rfftn(tproj)))
        fproj_z = fproj.max(axis=2)
        fproj[fproj == fproj.max()] = 0
        fproj_arr[:, ii] = stats.rankdata(fproj_z.flatten())
        fproj_arr_val[:, ii] = fproj_z.flatten()
        spr.append(np.array(fproj_z > fproj_z.max() / 4, dtype=np.int).sum())
        fprojr = np.array([fproj, fproj[:, :, ::-1]]).max(0)
        fdist.append(np.max([utils.fitgaussian(fproj.max(jj))[3:].max()
                             for jj in range(fprojr.ndim)]))
    fdist = np.array(fdist)
    spr = np.array(spr)

    """
    Step 3: Create feature space of component properties
    """
    LGR.debug('Creating feature space of component properties')
    fdist_pre = fdist.copy()
    fdist_pre[fdist > np.median(fdist) * 3] = np.median(fdist) * 3
    fdist_z = (fdist_pre - np.median(fdist_pre)) / fdist_pre.std()
    spz = (spr - spr.mean()) / spr.std()
    Tz = (tt_table[:, 0] - tt_table[:, 0].mean()) / tt_table[:, 0].std()
    varex_ = np.log(seldict['varex'])
    Vz = (varex_ - varex_.mean()) / varex_.std()
    Rz = (seldict['Rhos'] - seldict['Rhos'].mean()) / seldict['Rhos'].std()
    Ktz = np.log(seldict['Kappas']) / 2
    Ktz = (Ktz - Ktz.mean()) / Ktz.std()
    Rtz = np.log(seldict['Rhos']) / 2
    Rtz = (Rtz - Rtz.mean()) / Rtz.std()
    KRr = stats.zscore(np.log(seldict['Kappas']) / np.log(seldict['Rhos']))
    cnz = (countnoise - countnoise.mean()) / countnoise.std()
    Dz = stats.zscore(np.arctanh(dice_tbl[:, 0] + 0.001))
    fz = np.array([Tz, Vz, Ktz, KRr, cnz, Rz, mmix_kurt, fdist_z])

    """
    Step 4: Make initial guess of where BOLD components are and use DBSCAN
    to exclude noise components and find a sample set of 'good' components
    """
    LGR.debug('Making initial guess of BOLD components')
    # epsmap is [index, level of overlap with dicemask,
    # number of high Rho components]
    F05, F025, F01 = utils.getfbounds(n_echos)
    epsmap = []
    Rhos_sorted = np.array(sorted(seldict['Rhos']))[::-1]
    # Make an initial guess as to number of good components based on
    # consensus of control points across Rhos and Kappas
    KRcutguesses = [getelbow_mod(seldict['Rhos']), getelbow_cons(seldict['Rhos']),
                    getelbow_aggr(seldict['Rhos']), getelbow_mod(seldict['Kappas']),
                    getelbow_cons(seldict['Kappas']), getelbow_aggr(seldict['Kappas'])]
    Khighelbowval = stats.scoreatpercentile([getelbow_mod(seldict['Kappas'], val=True),
                                             getelbow_cons(seldict['Kappas'], val=True),
                                             getelbow_aggr(seldict['Kappas'], val=True)] +
                                            list(utils.getfbounds(n_echos)),
                                            75, interpolation_method='lower')
    KRcut = np.median(KRcutguesses)
    # only use exclusive when inclusive is extremely inclusive - double KRcut
    cond1 = getelbow_cons(seldict['Kappas']) > KRcut * 2
    cond2 = getelbow_mod(seldict['Kappas'], val=True) < F01
    if cond1 and cond2:
        Kcut = getelbow_mod(seldict['Kappas'], val=True)
    else:
        Kcut = getelbow_cons(seldict['Kappas'], val=True)
    # only use inclusive when exclusive is extremely exclusive - half KRcut
    # (remember for Rho inclusive is higher, so want both Kappa and Rho
    # to default to lower)
    if getelbow_cons(seldict['Rhos']) > KRcut * 2:
        Rcut = getelbow_mod(seldict['Rhos'], val=True)
        # for above, consider something like:
        # min([getelbow_mod(Rhos, True), sorted(Rhos)[::-1][KRguess]])
    else:
        Rcut = getelbow_cons(seldict['Rhos'], val=True)
    if Rcut > Kcut:
        Kcut = Rcut  # Rcut should never be higher than Kcut
    KRelbow = utils.andb([seldict['Kappas'] > Kcut, seldict['Rhos'] < Rcut])
    # Make guess of Kundu et al 2011 plus remove high frequencies,
    # generally high variance, and high variance given low Kappa
    tt_lim = stats.scoreatpercentile(tt_table[tt_table[:, 0] > 0, 0],
                                     75, interpolation_method='lower') / 3
    KRguess = np.setdiff1d(np.setdiff1d(nc[KRelbow == 2], rej),
                           np.union1d(nc[tt_table[:, 0] < tt_lim],
                                      np.union1d(np.union1d(nc[spz > 1], nc[Vz > 2]),
                                                 nc[utils.andb([seldict['varex'] > 0.5 *
                                                                sorted(seldict['varex'])[::-1][int(KRcut)],
                                                                seldict['Kappas'] < 2 * Kcut]) == 2])))
    guessmask = np.zeros(len(nc))
    guessmask[KRguess] = 1
    # Throw lower-risk bad components out
    rejB = ncl[utils.andb([tt_table[ncl, 0] < 0,
                           seldict['varex'][ncl] > np.median(seldict['varex']),
                           ncl > KRcut]) == 3]
    rej = np.union1d(rej, rejB)
    ncl = np.setdiff1d(ncl, rej)

    LGR.debug('Using DBSCAN to find optimal set of "good" BOLD components')
    for ii in range(20000):
        eps = .005 + ii * .005
        db = DBSCAN(eps=eps, min_samples=3).fit(fz.T)
        # it would be great to have descriptive names, here
        # DBSCAN found at least three non-noisy clusters
        cond1 = db.labels_.max() > 1
        # DBSCAN didn't detect more classes than the total
        # # of components / 6
        cond2 = db.labels_.max() < len(nc) / 6
        # TODO: confirm if 0 is a special label for DBSCAN
        # my intuition here is that we're confirming DBSCAN labelled previously
        # rejected components as noise (i.e., no overlap between `rej` and
        # labelled DBSCAN components)
        cond3 = np.intersect1d(rej, nc[db.labels_ == 0]).shape[0] == 0
        # DBSCAN labelled less than half of the total components as noisy
        cond4 = np.array(db.labels_ == -1, dtype=int).sum() / float(len(nc)) < .5
        if cond1 and cond2 and cond3 and cond4:
            epsmap.append([ii, utils.dice(guessmask, db.labels_ == 0),
                           np.intersect1d(nc[db.labels_ == 0],
                                          nc[seldict['Rhos'] >
                                             getelbow_mod(Rhos_sorted, val=True)]).shape[0]])
        db = None
    epsmap = np.array(epsmap)
    LGR.debug('Found DBSCAN solutions for {}/20000 eps resolutions'.format(len(epsmap)))
    group0 = []
    dbscanfailed = False
    if len(epsmap) != 0:
        # Select index that maximizes Dice with guessmask but first
        # minimizes number of higher Rho components
        ii = int(epsmap[np.argmax(epsmap[epsmap[:, 2] == np.min(epsmap[:, 2]), 1], 0), 0])
        LGR.debug('Component selection tuning: {:.05f}'.format(epsmap[:, 1].max()))
        db = DBSCAN(eps=.005 + ii * .005, min_samples=3).fit(fz.T)
        ncl = nc[db.labels_ == 0]
        ncl = np.setdiff1d(ncl, rej)
        ncl = np.setdiff1d(ncl, ncl[ncl > len(nc) - len(rej)])
        group0 = ncl.copy()
        group_n1 = nc[db.labels_ == -1]
        to_clf = np.setdiff1d(nc, np.union1d(ncl, rej))
    if len(group0) == 0 or len(group0) < len(KRguess) * .5:
        dbscanfailed = True
        LGR.debug('DBSCAN guess failed; using elbow guess method instead')
        ncl = np.setdiff1d(np.setdiff1d(nc[KRelbow == 2], rej),
                           np.union1d(nc[tt_table[:, 0] < tt_lim],
                                      np.union1d(np.union1d(nc[spz > 1], nc[Vz > 2]),
                                                 nc[utils.andb([seldict['varex'] > 0.5 *
                                                                sorted(seldict['varex'])[::-1][int(KRcut)],
                                                                seldict['Kappas'] < 2 * Kcut]) == 2])))
        group0 = ncl.copy()
        group_n1 = []
        to_clf = np.setdiff1d(nc, np.union1d(group0, rej))
    if len(group0) < 2 or (len(group0) < 4 and float(len(rej)) / len(group0) > 3):
        LGR.warning('Extremely limited reliable BOLD signal space! '
                    'Not filtering components beyond BOLD/non-BOLD guesses.')
        midkfailed = True
        min_acc = np.array([])
        if len(group0) != 0:
            # For extremes, building in a 20% tolerance
            toacc_hi = np.setdiff1d(nc[utils.andb([fdist <= np.max(fdist[group0]),
                                                   seldict['Rhos'] < F025, Vz > -2]) == 3],
                                    np.union1d(group0, rej))
            min_acc = np.union1d(group0, toacc_hi)
            to_clf = np.setdiff1d(nc, np.union1d(min_acc, rej))
        diagstep_keys = ['Rejected components', 'Kappa-Rho cut point', 'Kappa cut point',
                         'Rho cut point', 'DBSCAN failed to converge',
                         'Mid-Kappa failed (limited BOLD signal)', 'Kappa-Rho guess',
                         'min_acc', 'toacc_hi']
        diagstep_vals = [rej.tolist(), KRcut, Kcut, Rcut, dbscanfailed,
                         midkfailed, KRguess.tolist(), min_acc.tolist(), toacc_hi.tolist()]
        with open('csstepdata.json', 'w') as ofh:
            json.dump(dict(zip(diagstep_keys, diagstep_vals)), ofh, indent=4, sort_keys=True)
        return list(sorted(min_acc)), list(sorted(rej)), [], list(sorted(to_clf))

    # Find additional components to reject based on Dice - doing this here
    # since Dice is a little unstable, need to reference group0
    rej_supp = []
    dice_rej = False
    if not dbscanfailed and len(rej) + len(group0) < 0.75 * len(nc):
        dice_rej = True
        rej_supp = np.setdiff1d(np.setdiff1d(np.union1d(rej,
                                                        nc[dice_tbl[nc, 0] <= dice_tbl[nc, 1]]),
                                             group0), group_n1)
        rej = np.union1d(rej, rej_supp)

    # Temporal features
    # larger is worse - spike
    mmix_kurt_z = (mmix_kurt - mmix_kurt[group0].mean()) / mmix_kurt[group0].std()
    # smaller is worse - drift
    mmix_std_z = -1 * ((mmix_std - mmix_std[group0].mean()) / mmix_std[group0].std())
    mmix_kurt_z_max = np.max([mmix_kurt_z, mmix_std_z], 0)

    """
    Step 5: Classify midk and ignore using separate SVMs for different
    variance regimes
    # To render hyperplane:
    min_x = np.min(spz2); max_x = np.max(spz2)
    # plotting separating hyperplane
    ww = clf_.coef_[0]
    aa = -ww[0] / ww[1]
    # make sure the next line is long enough
    xx = np.linspace(min_x - 2, max_x + 2)
    yy = aa * xx - (clf_.intercept_[0]) / ww[1]
    plt.plot(xx, yy, '-')
    """
    LGR.debug('Attempting to classify midk components')
    # Tried getting rid of accepting based on SVM altogether,
    # now using only rejecting
    toacc_hi = np.setdiff1d(nc[utils.andb([fdist <= np.max(fdist[group0]),
                                           seldict['Rhos'] < F025, Vz > -2]) == 3],
                            np.union1d(group0, rej))
    toacc_lo = np.intersect1d(to_clf,
                              nc[utils.andb([spz < 1, Rz < 0, mmix_kurt_z_max < 5,
                                             Dz > -1, Tz > -1, Vz < 0,
                                             seldict['Kappas'] >= F025,
                                             fdist < 3 * np.percentile(fdist[group0], 98)]) == 8])
    midk_clf, clf_ = do_svm(fproj_arr_val[:, np.union1d(group0, rej)].T,
                            [0] * len(group0) + [1] * len(rej),
                            fproj_arr_val[:, to_clf].T,
                            svmtype=2)
    midk = np.setdiff1d(to_clf[utils.andb([midk_clf == 1,
                                           seldict['varex'][to_clf] >
                                           np.median(seldict['varex'][group0])]) == 2],
                        np.union1d(toacc_hi, toacc_lo))
    # only use SVM to augment toacc_hi only if toacc_hi isn't already
    # conflicting with SVM choice
    if len(np.intersect1d(to_clf[utils.andb([midk_clf == 1,
                                             Vz[to_clf] > 0]) == 2],
                          toacc_hi)) == 0:
        svm_acc_fail = True
        toacc_hi = np.union1d(toacc_hi, to_clf[midk_clf == 0])
    else:
        svm_acc_fail = False

    """
    Step 6: Compute variance associated with low T2* areas
    (e.g., draining veins)
    # To write out veinmask
    veinout = np.zeros(t2s.shape)
    veinout[t2s != 0] = veinmaskf
    utils.filewrite(veinout, 'veinmaskf', ref_img)
    veinBout = utils.unmask(veinmaskB, mask)
    utils.filewrite(veinBout, 'veins50', ref_img)
    """
    LGR.debug('Computing variance associated with low T2* areas (e.g., draining veins)')
    tsoc_B_Zcl = np.zeros(seldict['tsoc_B'].shape)
    tsoc_B_Zcl[seldict['Z_clmaps'] != 0] = np.abs(seldict['tsoc_B'])[seldict['Z_clmaps'] != 0]
    sig_B = [stats.scoreatpercentile(tsoc_B_Zcl[tsoc_B_Zcl[:, ii] != 0, ii], 25)
             if len(tsoc_B_Zcl[tsoc_B_Zcl[:, ii] != 0, ii]) != 0 else 0 for ii in nc]
    sig_B = np.abs(seldict['tsoc_B']) > np.tile(sig_B, [seldict['tsoc_B'].shape[0], 1])
    veinmask = utils.andb([t2s < stats.scoreatpercentile(t2s[t2s != 0], 15,
                                                         interpolation_method='lower'),
                           t2s != 0]) == 2
    veinmaskf = veinmask[mask]
    veinR = np.array(sig_B[veinmaskf].sum(0), dtype=float) / sig_B[~veinmaskf].sum(0)
    veinR[np.isnan(veinR)] = 0
    veinc = np.union1d(rej, midk)
    rej_veinRZ = ((veinR - veinR[veinc].mean()) / veinR[veinc].std())[veinc]
    rej_veinRZ[rej_veinRZ < 0] = 0
    rej_veinRZ[countsigFR2[veinc] > np.array(veinmaskf, dtype=int).sum()] = 0
    t2s_lim = [stats.scoreatpercentile(t2s[t2s != 0], 50, interpolation_method='lower'),
               stats.scoreatpercentile(t2s[t2s != 0], 80, interpolation_method='lower') / 2]
    phys_var_zs = []
    for t2sl_i in range(len(t2s_lim)):
        t2sl = t2s_lim[t2sl_i]
        veinW = sig_B[:, veinc] * np.tile(rej_veinRZ, [sig_B.shape[0], 1])
        veincand = utils.unmask(utils.andb([s0[t2s != 0] < np.median(s0[t2s != 0]),
                                            t2s[t2s != 0] < t2sl]) >= 1,
                                t2s != 0)[mask]
        veinW[~veincand] = 0
        invein = veinW.sum(axis=1)[(utils.unmask(veinmaskf, mask) *
                                    utils.unmask(veinW.sum(axis=1) > 1, mask))[mask]]
        minW = 10 * (np.log10(invein).mean()) - 1 * 10**(np.log10(invein).std())
        veinmaskB = veinW.sum(axis=1) > minW
        tsoc_Bp = seldict['tsoc_B'].copy()
        tsoc_Bp[tsoc_Bp < 0] = 0
        vvex = np.array([(tsoc_Bp[veinmaskB, ii]**2.).sum() / (tsoc_Bp[:, ii]**2.).sum()
                         for ii in nc])
        group0_res = np.intersect1d(KRguess, group0)
        phys_var_zs.append((vvex - vvex[group0_res].mean()) / vvex[group0_res].std())
        veinBout = utils.unmask(veinmaskB, mask)
        utils.filewrite(veinBout.astype(float), 'veins_l%i' % t2sl_i, ref_img)
    # Mask to sample veins
    phys_var_z = np.array(phys_var_zs).max(0)
    Vz2 = (varex_ - varex_[group0].mean()) / varex_[group0].std()

    """
    Step 7: Learn joint TE-dependence spatial and temporal models to move
    remaining artifacts to ignore class
    """
    LGR.debug('Learning joint TE-dependence spatial/temporal models to ignore remaining artifacts')
    to_ign = []
    minK_ign = np.max([F05, getelbow_cons(seldict['Kappas'], val=True)])
    newcest = len(group0) + len(toacc_hi[seldict['Kappas'][toacc_hi] > minK_ign])
    phys_art = np.setdiff1d(nc[utils.andb([phys_var_z > 3.5,
                                           seldict['Kappas'] < minK_ign]) == 2], group0)
    rank_diff = stats.rankdata(phys_var_z) - stats.rankdata(seldict['Kappas'])
    phys_art = np.union1d(np.setdiff1d(nc[utils.andb([phys_var_z > 2,
                                                      rank_diff > newcest / 2,
                                                      Vz2 > -1]) == 3],
                                       group0), phys_art)
    # Want to replace field_art with an acf/SVM based approach
    # instead of a kurtosis/filter one
    field_art = np.setdiff1d(nc[utils.andb([mmix_kurt_z_max > 5,
                                            seldict['Kappas'] < minK_ign]) == 2], group0)
    field_art = np.union1d(np.setdiff1d(nc[utils.andb([mmix_kurt_z_max > 2,
                                                       (stats.rankdata(mmix_kurt_z_max) -
                                                        stats.rankdata(seldict['Kappas'])) > newcest / 2,
                                                       Vz2 > 1,
                                                       seldict['Kappas'] < F01]) == 4],
                                        group0), field_art)
    field_art = np.union1d(np.setdiff1d(nc[utils.andb([mmix_kurt_z_max > 3,
                                                       Vz2 > 3,
                                                       seldict['Rhos'] >
                                                       np.percentile(seldict['Rhos'][group0], 75)]) == 3],
                                        group0), field_art)
    field_art = np.union1d(np.setdiff1d(nc[utils.andb([mmix_kurt_z_max > 5,
                                                       Vz2 > 5]) == 2],
                                        group0), field_art)
    misc_art = np.setdiff1d(nc[utils.andb([(stats.rankdata(Vz) -
                                            stats.rankdata(Ktz)) > newcest / 2,
                                           seldict['Kappas'] < Khighelbowval]) == 2], group0)
    ign_cand = np.unique(list(field_art) + list(phys_art) + list(misc_art))
    midkrej = np.union1d(midk, rej)
    to_ign = np.setdiff1d(list(ign_cand), midkrej)
    toacc = np.union1d(toacc_hi, toacc_lo)
    ncl = np.setdiff1d(np.union1d(ncl, toacc), np.union1d(to_ign, midkrej))
    ign = np.setdiff1d(nc, list(ncl) + list(midk) + list(rej))
    orphan = np.setdiff1d(nc, list(ncl) + list(to_ign) + list(midk) + list(rej))

    # Last ditch effort to save some transient components
    if not strict_mode:
        Vz3 = (varex_ - varex_[ncl].mean()) / varex_[ncl].std()
        ncl = np.union1d(ncl, np.intersect1d(orphan,
                                             nc[utils.andb([seldict['Kappas'] > F05,
                                                            seldict['Rhos'] < F025,
                                                            seldict['Kappas'] > seldict['Rhos'],
                                                            Vz3 <= -1,
                                                            Vz3 > -3,
                                                            mmix_kurt_z_max < 2.5]) == 6]))
        ign = np.setdiff1d(nc, list(ncl) + list(midk) + list(rej))
        orphan = np.setdiff1d(nc, list(ncl) + list(to_ign) + list(midk) + list(rej))

    if savecsdiag:
        diagstep_keys = ['Rejected components', 'Kappa-Rho cut point', 'Kappa cut',
                         'Rho cut', 'DBSCAN failed to converge', 'Kappa-Rho guess',
                         'Dice rejected', 'rej_supp', 'to_clf', 'Mid-kappa components',
                         'svm_acc_fail', 'toacc_hi', 'toacc_lo', 'Field artifacts',
                         'Physiological artifacts', 'Miscellaneous artifacts', 'ncl',
                         'Ignored components']
        diagstep_vals = [rej.tolist(), KRcut, Kcut, Rcut, dbscanfailed,
                         KRguess.tolist(), dice_rej, rej_supp.tolist(), to_clf.tolist(),
                         midk.tolist(), svm_acc_fail, toacc_hi.tolist(), toacc_lo.tolist(),
                         field_art.tolist(), phys_art.tolist(), misc_art.tolist(),
                         ncl.tolist(), ign.tolist()]
        with open('csstepdata.json', 'w') as ofh:
            json.dump(dict(zip(diagstep_keys, diagstep_vals)), ofh,
                      indent=4, sort_keys=True)
        allfz = np.array([Tz, Vz, Ktz, KRr, cnz, Rz, mmix_kurt, fdist_z])
        np.savetxt('csdata.txt', allfz)

    return list(sorted(ncl)), list(sorted(rej)), list(sorted(midk)), list(sorted(ign))
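

# ----------------------------------------------------------------------
# Hedged, self-contained sketch (not part of the pipeline above) of the
# DBSCAN epsilon sweep that `selcomps` uses: epsilon is scanned on a fine
# grid, and a solution is kept only when it yields more than one cluster
# with a limited fraction of noise labels. All data below are synthetic,
# standing in for the z-scored component features stacked in `fz`.
def _sketch_dbscan_eps_sweep():
    import numpy as np
    from sklearn.cluster import DBSCAN

    rng = np.random.RandomState(0)
    # two well-separated synthetic clusters in a 4-D feature space
    feats = np.vstack([rng.normal(0, 1, (40, 4)),
                       rng.normal(6, 1, (40, 4))])
    solutions = []
    for ii in range(600):
        eps = .005 + ii * .005
        labels = DBSCAN(eps=eps, min_samples=3).fit(feats).labels_
        noise_frac = (labels == -1).sum() / float(len(labels))
        # keep solutions with at least two clusters and < 50% noise labels
        if labels.max() >= 1 and noise_frac < .5:
            solutions.append((eps, labels.max() + 1, noise_frac))
    return solutions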
def tedpca(catd, OCcatd, combmode, mask, t2s, t2sG, stabilize, ref_img,
           tes, kdaw, rdaw, ste=0, mlepca=True, wvpca=False):
    """
    Use principal components analysis (PCA) to identify and remove thermal
    noise from multi-echo data.

    Parameters
    ----------
    catd : (S x E x T) array_like
        Input functional data
    OCcatd : (S x T) array_like
        Optimally-combined time series data
    combmode : {'t2s', 'ste'} str
        How optimal combination of echos should be made, where 't2s' indicates
        using the method of Posse 1999 and 'ste' indicates using the method of
        Poser 2006
    mask : (S,) array_like
        Boolean mask array
    t2s : (S,) array_like
    t2sG : (S,) array_like
    stabilize : :obj:`bool`
        Whether to attempt to stabilize convergence of ICA by returning
        dimensionally-reduced data from PCA and component selection.
    ref_img : :obj:`str` or img_like
        Reference image to dictate how outputs are saved to disk
    tes : :obj:`list`
        List of echo times associated with `catd`, in milliseconds
    kdaw : :obj:`float`
        Dimensionality augmentation weight for Kappa calculations
    rdaw : :obj:`float`
        Dimensionality augmentation weight for Rho calculations
    ste : :obj:`int` or :obj:`list` of :obj:`int`, optional
        Which echos to use in PCA. Values -1 and 0 are special, where a value
        of -1 will indicate using the optimal combination of the echos and 0
        will indicate using all the echos. A list can be provided to indicate
        a subset of echos. Default: 0
    mlepca : :obj:`bool`, optional
        Whether to use the method originally explained in Minka, NIPS 2000 for
        guessing PCA dimensionality instead of a traditional SVD. Default: True
    wvpca : :obj:`bool`, optional
        Whether to apply wavelet denoising to data. Default: False

    Returns
    -------
    n_components : :obj:`int`
        Number of components retained from PCA decomposition
    dd : (S x E x T) :obj:`numpy.ndarray`
        Dimensionally-reduced functional data

    Notes
    -----
    ====================== =================================================
    Notation               Meaning
    ====================== =================================================
    :math:`\\kappa`        Component pseudo-F statistic for TE-dependent
                           (BOLD) model.
    :math:`\\rho`          Component pseudo-F statistic for TE-independent
                           (artifact) model.
    :math:`v`              Voxel
    :math:`V`              Total number of voxels in mask
    :math:`\\zeta`         Something
    :math:`c`              Component
    :math:`p`              Something else
    ====================== =================================================

    Steps:

    1.  Variance normalize either multi-echo or optimally combined data,
        depending on settings.
    2.  Decompose normalized data using PCA or SVD.
    3.  Compute :math:`{\\kappa}` and :math:`{\\rho}`:

        .. math::
            {\\kappa}_c = \\frac{\\sum_{v}^V {\\zeta}_{c,v}^p *
                          F_{c,v,R_2^*}}{\\sum_{v}^V {\\zeta}_{c,v}^p}

            {\\rho}_c = \\frac{\\sum_{v}^V {\\zeta}_{c,v}^p *
                        F_{c,v,S_0}}{\\sum_{v}^V {\\zeta}_{c,v}^p}

    4.  Some other stuff. Something about elbows.
    5.  Classify components as thermal noise if they meet both of the
        following criteria:

        - Nonsignificant :math:`{\\kappa}` and :math:`{\\rho}`.
        - Nonsignificant variance explained.

    Outputs:

    This function writes out several files:

    ====================== =================================================
    Filename               Content
    ====================== =================================================
    pcastate.pkl           Values from PCA results.
    comp_table_pca.txt     PCA component table.
    mepca_mix.1D           PCA mixing matrix.
====================== ================================================= """ n_samp, n_echos, n_vols = catd.shape ste = np.array([int(ee) for ee in str(ste).split(',')]) if len(ste) == 1 and ste[0] == -1: LGR.info('Computing PCA of optimally combined multi-echo data') d = OCcatd[utils.make_min_mask(OCcatd[:, np.newaxis, :])][:, np.newaxis, :] elif len(ste) == 1 and ste[0] == 0: LGR.info('Computing PCA of spatially concatenated multi-echo data') d = catd[mask].astype('float64') else: LGR.info('Computing PCA of echo #%s' % ','.join([str(ee) for ee in ste])) d = np.stack([catd[mask, ee] for ee in ste - 1], axis=1).astype('float64') eim = np.squeeze(eimask(d)) d = np.squeeze(d[eim]) dz = ((d.T - d.T.mean(axis=0)) / d.T.std(axis=0)).T # var normalize ts dz = (dz - dz.mean()) / dz.std() # var normalize everything if wvpca: dz, cAl = dwtmat(dz) if not op.exists('pcastate.pkl'): # do PC dimension selection and get eigenvalue cutoff if mlepca: from sklearn.decomposition import PCA ppca = PCA(n_components='mle', svd_solver='full') ppca.fit(dz) v = ppca.components_ s = ppca.explained_variance_ u = np.dot(np.dot(dz, v.T), np.diag(1. / s)) else: u, s, v = np.linalg.svd(dz, full_matrices=0) # actual variance explained (normalized) sp = s / s.sum() eigelb = getelbow_mod(sp, return_val=True) spdif = np.abs(np.diff(sp)) spdifh = spdif[(len(spdif) // 2):] spdthr = np.mean([spdifh.max(), spdif.min()]) spmin = sp[(len(spdif) // 2) + np.arange(len(spdifh))[spdifh >= spdthr][0] + 1] spcum = np.cumsum(sp) # Compute K and Rho for PCA comps eimum = np.atleast_2d(eim) eimum = np.transpose(eimum, np.argsort(eimum.shape)[::-1]) eimum = eimum.prod(axis=1) o = np.zeros((mask.shape[0], *eimum.shape[1:])) o[mask] = eimum eimum = np.squeeze(o).astype(bool) vTmix = v.T vTmixN = ((vTmix.T - vTmix.T.mean(0)) / vTmix.T.std(0)).T LGR.info('Making initial component selection guess from PCA results') _, ctb, betasv, v_T = model.fitmodels_direct(catd, v.T, eimum, t2s, t2sG, tes, combmode, ref_img, mmixN=vTmixN, full_sel=False) ctb = ctb[ctb[:, 0].argsort(), :] ctb = np.vstack([ctb.T[:3], sp]).T # Save state fname = op.abspath('pcastate.pkl') LGR.info('Saving PCA results to: {}'.format(fname)) pcastate = { 'u': u, 's': s, 'v': v, 'ctb': ctb, 'eigelb': eigelb, 'spmin': spmin, 'spcum': spcum } try: with open(fname, 'wb') as handle: pickle.dump(pcastate, handle) except TypeError: LGR.warning('Could not save PCA solution') else: # if loading existing state LGR.info('Loading PCA from: pcastate.pkl') with open('pcastate.pkl', 'rb') as handle: pcastate = pickle.load(handle) u, s, v = pcastate['u'], pcastate['s'], pcastate['v'] ctb, eigelb = pcastate['ctb'], pcastate['eigelb'] spmin, spcum = pcastate['spmin'], pcastate['spcum'] np.savetxt('comp_table_pca.txt', ctb[ctb[:, 1].argsort(), :][::-1]) np.savetxt('mepca_mix.1D', v[ctb[:, 1].argsort()[::-1], :].T) kappas = ctb[ctb[:, 1].argsort(), 1] rhos = ctb[ctb[:, 2].argsort(), 2] fmin, fmid, fmax = utils.getfbounds(n_echos) kappa_thr = np.average(sorted( [fmin, getelbow_mod(kappas, return_val=True) / 2, fmid]), weights=[kdaw, 1, 1]) rho_thr = np.average(sorted( [fmin, getelbow_cons(rhos, return_val=True) / 2, fmid]), weights=[rdaw, 1, 1]) if int(kdaw) == -1: kappas_lim = kappas[utils.andb([kappas < fmid, kappas > fmin]) == 2] kappa_thr = kappas_lim[getelbow_mod(kappas_lim)] rhos_lim = rhos[utils.andb([rhos < fmid, rhos > fmin]) == 2] rho_thr = rhos_lim[getelbow_mod(rhos_lim)] stabilize = True if int(kdaw) != -1 and int(rdaw) == -1: rhos_lim = rhos[utils.andb([rhos < fmid, rhos > fmin]) == 2] 
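    # (When rdaw == -1, the Rho threshold below is taken directly from the
    # elbow of the Rho values restricted to the (fmin, fmid) window, instead
    # of the weighted average computed above; kdaw == -1 does the same for
    # both thresholds and additionally forces `stabilize = True`.)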
rho_thr = rhos_lim[getelbow_mod(rhos_lim)] is_hik = np.array(ctb[:, 1] > kappa_thr, dtype=np.int) is_hir = np.array(ctb[:, 2] > rho_thr, dtype=np.int) is_hie = np.array(ctb[:, 3] > eigelb, dtype=np.int) is_his = np.array(ctb[:, 3] > spmin, dtype=np.int) is_not_fmax1 = np.array(ctb[:, 1] != F_MAX, dtype=np.int) is_not_fmax2 = np.array(ctb[:, 2] != F_MAX, dtype=np.int) pcscore = (is_hik + is_hir + is_hie) * is_his * is_not_fmax1 * is_not_fmax2 if stabilize: temp7 = np.array(spcum < 0.95, dtype=np.int) temp8 = np.array(ctb[:, 2] > fmin, dtype=np.int) temp9 = np.array(ctb[:, 1] > fmin, dtype=np.int) pcscore = pcscore * temp7 * temp8 * temp9 pcsel = pcscore > 0 dd = u.dot(np.diag(s * np.array(pcsel, dtype=np.int))).dot(v) if wvpca: dd = idwtmat(dd, cAl) n_components = s[pcsel].shape[0] LGR.info('Selected {0} components with Kappa threshold: {1:.02f}, ' 'Rho threshold: {2:.02f}'.format(n_components, kappa_thr, rho_thr)) dd = stats.zscore(dd.T, axis=0).T # variance normalize timeseries dd = stats.zscore(dd, axis=None) # variance normalize everything return n_components, dd
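

# ----------------------------------------------------------------------
# Hedged illustration (not part of the pipeline): how the Kappa threshold
# in `tedpca` blends the F-distribution bounds with the Kappa elbow. The
# numbers below are made up; only the weighted-average form mirrors the
# code above.
def _sketch_kappa_threshold():
    import numpy as np

    fmin, fmid = 4.0, 6.0    # hypothetical F bounds (e.g., p < .05, p < .025)
    kappa_elbow = 30.0       # hypothetical Kappa elbow value
    kdaw = 10.0              # dimensionality augmentation weight
    kappa_thr = np.average(sorted([fmin, kappa_elbow / 2, fmid]),
                           weights=[kdaw, 1, 1])
    # the kdaw weight is applied to the smallest of the three sorted values,
    # so a larger kdaw pulls the threshold toward fmin (more components kept)
    return kappa_thr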
def fitmodels_direct(catd, mmix, mask, t2s, t2sG, tes, combmode, ref_img,
                     fout=None, reindex=False, mmixN=None, full_sel=True):
    """
    Fit models directly.

    Parameters
    ----------
    catd : (S x E x T) array_like
        Input data, where `S` is samples, `E` is echos, and `T` is time
    mmix : (T x C) array_like
        Mixing matrix for converting input data to component space, where `C`
        is components and `T` is the same as in `catd`
    mask : (S,) array_like
        Boolean mask array
    t2s : (S,) array_like
    t2sG : (S,) array_like
    tes : list
        List of echo times associated with `catd`, in milliseconds
    combmode : {'t2s', 'ste'} str
        How optimal combination of echos should be made, where 't2s' indicates
        using the method of Posse 1999 and 'ste' indicates using the method of
        Poser 2006
    ref_img : str or img_like
        Reference image to dictate how outputs are saved to disk
    fout : bool, optional
        Whether to output per-component TE-dependence maps. Default: None
    reindex : bool, optional
        Default: False
    mmixN : array_like, optional
        Default: None
    full_sel : bool, optional
        Whether to perform selection of components based on Rho/Kappa scores.
        Default: True

    Returns
    -------
    seldict : dict
    comptab : (N x 5) :obj:`numpy.ndarray`
        Array with columns denoting (1) index of component, (2) Kappa score of
        component, (3) Rho score of component, (4) variance explained by
        component, and (5) normalized variance explained by component
    betas : :obj:`numpy.ndarray`
    mmix_new : :obj:`numpy.ndarray`
    """
    # compute optimal combination of raw data
    tsoc = model.make_optcom(catd, t2sG, tes, mask, combmode,
                             verbose=False).astype(float)[mask]
    # demean optimal combination
    tsoc_dm = tsoc - tsoc.mean(axis=-1, keepdims=True)
    # compute un-normalized weight dataset (features)
    if mmixN is None:
        mmixN = mmix
    WTS = computefeats2(utils.unmask(tsoc, mask), mmixN, mask, normalize=False)
    # compute PSC dataset - shouldn't have to refit data
    tsoc_B = get_coeffs(utils.unmask(tsoc_dm, mask), mask, mmix)[mask]
    tsoc_Babs = np.abs(tsoc_B)
    PSC = tsoc_B / tsoc.mean(axis=-1, keepdims=True) * 100
    # compute skews to determine signs based on unnormalized weights,
    # correct mmix & WTS signs based on spatial distribution tails
    signs = stats.skew(WTS, axis=0)
    signs /= np.abs(signs)
    mmix = mmix.copy()
    mmix *= signs
    WTS *= signs
    PSC *= signs
    totvar = (tsoc_B**2).sum()
    totvar_norm = (WTS**2).sum()
    # compute Betas and means over TEs for TE-dependence analysis
    betas = get_coeffs(catd, np.repeat(mask[:, np.newaxis], len(tes), axis=1),
                       mmix)
    n_samp, n_echos, n_components = betas.shape
    n_voxels = mask.sum()
    n_data_voxels = (t2s != 0).sum()
    mu = catd.mean(axis=-1, dtype=float)
    tes = np.reshape(tes, (n_echos, 1))
    fmin, fmid, fmax = utils.getfbounds(n_echos)
    # mask arrays
    mumask = mu[t2s != 0]
    t2smask = t2s[t2s != 0]
    betamask = betas[t2s != 0]
    # set up Xmats
    X1 = mumask.T  # Model 1
    X2 = np.tile(tes, (1, n_data_voxels)) * mumask.T / t2smask.T  # Model 2
    # tables for component selection
    Kappas = np.zeros([n_components])
    Rhos = np.zeros([n_components])
    varex = np.zeros([n_components])
    varex_norm = np.zeros([n_components])
    Z_maps = np.zeros([n_voxels, n_components])
    F_R2_maps = np.zeros([n_data_voxels, n_components])
    F_S0_maps = np.zeros([n_data_voxels, n_components])
    Z_clmaps = np.zeros([n_voxels, n_components])
    F_R2_clmaps = np.zeros([n_data_voxels, n_components])
    F_S0_clmaps = np.zeros([n_data_voxels, n_components])
    Br_clmaps_R2 = np.zeros([n_voxels, n_components])
    Br_clmaps_S0 = np.zeros([n_voxels, n_components])
    LGR.info('Fitting TE- and S0-dependent models to components')
    for i in range(n_components):
        # size of B is
(n_components, nx*ny*nz) B = np.atleast_3d(betamask)[:, :, i].T alpha = (np.abs(B)**2).sum(axis=0) varex[i] = (tsoc_B[:, i]**2).sum() / totvar * 100. varex_norm[i] = (utils.unmask(WTS, mask)[t2s != 0][:, i]**2).sum() / totvar_norm * 100. # S0 Model coeffs_S0 = (B * X1).sum(axis=0) / (X1**2).sum(axis=0) SSE_S0 = (B - X1 * np.tile(coeffs_S0, (n_echos, 1)))**2 SSE_S0 = SSE_S0.sum(axis=0) F_S0 = (alpha - SSE_S0) * 2 / (SSE_S0) F_S0_maps[:, i] = F_S0 # R2 Model coeffs_R2 = (B * X2).sum(axis=0) / (X2**2).sum(axis=0) SSE_R2 = (B - X2 * np.tile(coeffs_R2, (n_echos, 1)))**2 SSE_R2 = SSE_R2.sum(axis=0) F_R2 = (alpha - SSE_R2) * 2 / (SSE_R2) F_R2_maps[:, i] = F_R2 # compute weights as Z-values wtsZ = (WTS[:, i] - WTS[:, i].mean()) / WTS[:, i].std() wtsZ[np.abs(wtsZ) > Z_MAX] = (Z_MAX * (np.abs(wtsZ) / wtsZ))[np.abs(wtsZ) > Z_MAX] Z_maps[:, i] = wtsZ # compute Kappa and Rho F_S0[F_S0 > F_MAX] = F_MAX F_R2[F_R2 > F_MAX] = F_MAX norm_weights = np.abs(np.squeeze(utils.unmask(wtsZ, mask)[t2s != 0]**2.)) Kappas[i] = np.average(F_R2, weights=norm_weights) Rhos[i] = np.average(F_S0, weights=norm_weights) # tabulate component values comptab_pre = np.vstack([np.arange(n_components), Kappas, Rhos, varex, varex_norm]).T if reindex: # re-index all components in Kappa order comptab = comptab_pre[comptab_pre[:, 1].argsort()[::-1], :] Kappas = comptab[:, 1] Rhos = comptab[:, 2] varex = comptab[:, 3] varex_norm = comptab[:, 4] nnc = np.array(comptab[:, 0], dtype=np.int) mmix_new = mmix[:, nnc] F_S0_maps = F_S0_maps[:, nnc] F_R2_maps = F_R2_maps[:, nnc] Z_maps = Z_maps[:, nnc] WTS = WTS[:, nnc] PSC = PSC[:, nnc] tsoc_B = tsoc_B[:, nnc] tsoc_Babs = tsoc_Babs[:, nnc] comptab[:, 0] = np.arange(comptab.shape[0]) else: comptab = comptab_pre mmix_new = mmix # full selection including clustering criteria seldict = None if full_sel: LGR.info('Performing spatial clustering of components') csize = np.max([int(n_voxels * 0.0005) + 5, 20]) LGR.debug('Using minimum cluster size: {}'.format(csize)) for i in range(n_components): # save out files out = np.zeros((n_samp, 4)) out[:, 0] = np.squeeze(utils.unmask(PSC[:, i], mask)) out[:, 1] = np.squeeze(utils.unmask(F_R2_maps[:, i], t2s != 0)) out[:, 2] = np.squeeze(utils.unmask(F_S0_maps[:, i], t2s != 0)) out[:, 3] = np.squeeze(utils.unmask(Z_maps[:, i], mask)) if utils.get_dtype(ref_img) == 'GIFTI': continue # TODO: pass through GIFTI file data as below ccimg = utils.new_nii_like(ref_img, out) # Do simple clustering on F sel = spatclust(ccimg, min_cluster_size=csize, threshold=int(fmin), index=[1, 2], mask=(t2s != 0)) F_R2_clmaps[:, i] = sel[:, 0] F_S0_clmaps[:, i] = sel[:, 1] countsigFR2 = F_R2_clmaps[:, i].sum() countsigFS0 = F_S0_clmaps[:, i].sum() # Do simple clustering on Z at p<0.05 sel = spatclust(ccimg, min_cluster_size=csize, threshold=1.95, index=3, mask=mask) Z_clmaps[:, i] = sel # Do simple clustering on ranked signal-change map spclust_input = utils.unmask(stats.rankdata(tsoc_Babs[:, i]), mask) spclust_input = utils.new_nii_like(ref_img, spclust_input) Br_clmaps_R2[:, i] = spatclust(spclust_input, min_cluster_size=csize, threshold=max(tsoc_Babs.shape)-countsigFR2, mask=mask) Br_clmaps_S0[:, i] = spatclust(spclust_input, min_cluster_size=csize, threshold=max(tsoc_Babs.shape)-countsigFS0, mask=mask) seldict = {} selvars = ['Kappas', 'Rhos', 'WTS', 'varex', 'Z_maps', 'F_R2_maps', 'F_S0_maps', 'Z_clmaps', 'F_R2_clmaps', 'F_S0_clmaps', 'tsoc_B', 'Br_clmaps_R2', 'Br_clmaps_S0', 'PSC'] for vv in selvars: seldict[vv] = eval(vv) return seldict, comptab, betas, mmix_new
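

# ----------------------------------------------------------------------
# Hedged illustration (not part of the pipeline): a single-voxel, synthetic
# version of the per-component S0- and R2*-model fits in the loop above.
# All numbers are made up; only the algebra mirrors `fitmodels_direct`.
def _sketch_te_dependence_fstats():
    import numpy as np

    tes = np.array([15., 30., 45.])[:, np.newaxis]  # echo times (ms)
    mu = np.array([[1000.], [800.], [650.]])        # per-echo mean signal
    t2s = 40.                                       # T2* estimate (ms)
    B = np.array([[12.], [9.], [7.]])               # component betas per echo
    X1 = mu                                         # S0-model regressor
    X2 = tes * mu / t2s                             # R2*-model regressor
    fstats = []
    for X in (X1, X2):
        coeff = (B * X).sum(axis=0) / (X**2).sum(axis=0)  # one-parameter LS fit
        sse = ((B - X * coeff)**2).sum(axis=0)            # residual sum of squares
        alpha = (np.abs(B)**2).sum(axis=0)                # total sum of squares
        fstats.append((alpha - sse) * 2 / sse)            # same pseudo-F form as above
    return fstats                                         # [F_S0, F_R2]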
def selcomps(seldict, comptable, mmix, manacc, n_echos):
    """
    Classify components in seldict as "accepted," "rejected," or "ignored."

    The selection process uses previously calculated parameters listed in
    `seldict` for each ICA component such as Kappa (a T2* weighting metric),
    Rho (an S0 weighting metric), and variance explained. See `Notes` for
    additional calculated metrics used to classify each component into one
    of the listed groups.

    Parameters
    ----------
    seldict : :obj:`dict`
        A dictionary with component-specific features used for classification.
        As output from `fitmodels_direct`
    comptable : (C x X) :obj:`pandas.DataFrame`
        Component metric table. One row for each component, with a column for
        each metric. The index should be the component number.
    mmix : (T x C) array_like
        Mixing matrix for converting input data to component space, where `C`
        is components and `T` is the number of volumes in the original data
    manacc : :obj:`list`
        Comma-separated list of indices of manually accepted components
    n_echos : :obj:`int`
        Number of echos in original data

    Returns
    -------
    comptable : :obj:`pandas.DataFrame`
        Updated component table with additional metrics and with
        classification (accepted, rejected, or ignored)

    Notes
    -----
    The selection algorithm used in this function was originated in ME-ICA
    by Prantik Kundu, and his original implementation is available at:
    https://github.com/ME-ICA/me-ica/blob/b2781dd087ab9de99a2ec3925f04f02ce84f0adc/meica.libs/select_model.py

    This component selection process uses multiple, previously calculated
    metrics that include: kappa, rho, variance explained, component spatial
    weighting maps, noise and spatial frequency metrics, and measures of
    spatial overlap across metrics.

    Prantik began to update these selection criteria to use SVMs to
    distinguish components, a hypercommented version of this attempt is
    available at:
    https://gist.github.com/emdupre/ca92d52d345d08ee85e104093b81482e
    """
    cols_at_end = ['classification', 'rationale']

    # Lists of components
    all_comps = np.arange(comptable.shape[0])
    # unclf is a full list that is whittled down over criteria
    # since the default classification is "accepted", at the end of the tree
    # the remaining elements in unclf are classified as accepted
    unclf = all_comps.copy()

    # If user has specified components to accept manually
    if manacc:
        LGR.info('Performing manual ICA component selection')
        if ('classification' in comptable.columns and
                'original_classification' not in comptable.columns):
            comptable['original_classification'] = comptable['classification']
            comptable['original_rationale'] = comptable['rationale']
        comptable['classification'] = 'accepted'
        comptable['rationale'] = ''
        acc = [int(comp) for comp in manacc]
        rej = sorted(np.setdiff1d(all_comps, acc))
        comptable.loc[acc, 'classification'] = 'accepted'
        comptable.loc[rej, 'classification'] = 'rejected'
        comptable.loc[rej, 'rationale'] += 'I001;'
        # Move decision columns to end
        comptable = comptable[[c for c in comptable if c not in cols_at_end] +
                              [c for c in cols_at_end if c in comptable]]
        comptable['rationale'] = comptable['rationale'].str.rstrip(';')
        return comptable

    comptable['classification'] = 'accepted'
    comptable['rationale'] = ''

    Z_maps = seldict['Z_maps']
    Z_clmaps = seldict['Z_clmaps']
    F_R2_maps = seldict['F_R2_maps']
    F_S0_clmaps = seldict['F_S0_clmaps']
    F_R2_clmaps = seldict['F_R2_clmaps']
    # key names here match the seldict built by `fitmodels_direct` above
    Br_S0_clmaps = seldict['Br_clmaps_S0']
    Br_R2_clmaps = seldict['Br_clmaps_R2']

    # Set knobs
    n_vols, n_comps = mmix.shape
    LOW_PERC = 25
    HIGH_PERC = 90
    if n_vols < 100:
        EXTEND_FACTOR = 3
    else:
        EXTEND_FACTOR = 2
    RESTRICT_FACTOR =
2 """ Tally number of significant voxels for cluster-extent thresholded R2 and S0 model F-statistic maps. """ comptable['countsigFR2'] = F_R2_clmaps.sum(axis=0) comptable['countsigFS0'] = F_S0_clmaps.sum(axis=0) """ Generate Dice values for R2 and S0 models - dice_FR2: Dice value of cluster-extent thresholded maps of R2-model betas and F-statistics. - dice_FS0: Dice value of cluster-extent thresholded maps of S0-model betas and F-statistics. """ comptable['dice_FR2'] = np.zeros(all_comps.shape[0]) comptable['dice_FS0'] = np.zeros(all_comps.shape[0]) for i_comp in all_comps: comptable.loc[i_comp, 'dice_FR2'] = utils.dice(Br_R2_clmaps[:, i_comp], F_R2_clmaps[:, i_comp]) comptable.loc[i_comp, 'dice_FS0'] = utils.dice(Br_S0_clmaps[:, i_comp], F_S0_clmaps[:, i_comp]) comptable.loc[np.isnan(comptable['dice_FR2']), 'dice_FR2'] = 0 comptable.loc[np.isnan(comptable['dice_FS0']), 'dice_FS0'] = 0 """ Generate three metrics of component noise: - countnoise: Number of "noise" voxels (voxels highly weighted for component, but not from clusters) - signal-noise_t: T-statistic for two-sample t-test of F-statistics from "signal" voxels (voxels in clusters) against "noise" voxels (voxels not in clusters) for R2 model. - signal-noise_p: P-value from t-test. """ comptable['countnoise'] = 0 comptable['signal-noise_t'] = 0 comptable['signal-noise_p'] = 0 for i_comp in all_comps: # index voxels significantly loading on component but not from clusters comp_noise_sel = ((np.abs(Z_maps[:, i_comp]) > 1.95) & (Z_clmaps[:, i_comp] == 0)) comptable.loc[i_comp, 'countnoise'] = np.array( comp_noise_sel, dtype=np.int).sum() # NOTE: Why only compare distributions of *unique* F-statistics? noise_FR2_Z = np.log10(np.unique(F_R2_maps[comp_noise_sel, i_comp])) signal_FR2_Z = np.log10(np.unique( F_R2_maps[Z_clmaps[:, i_comp] == 1, i_comp])) (comptable.loc[i_comp, 'signal-noise_t'], comptable.loc[i_comp, 'signal-noise_p']) = stats.ttest_ind( signal_FR2_Z, noise_FR2_Z, equal_var=False) comptable.loc[np.isnan(comptable['signal-noise_t']), 'signal-noise_t'] = 0 comptable.loc[np.isnan(comptable['signal-noise_p']), 'signal-noise_p'] = 0 """ Assemble decision table with five metrics: - Kappa values ranked from largest to smallest - R2-model F-score map/beta map Dice scores ranked from largest to smallest - Signal F > Noise F t-statistics ranked from largest to smallest - Number of "noise" voxels (voxels highly weighted for component, but not from clusters) ranked from smallest to largest - Number of voxels with significant R2-model F-scores within clusters ranked from largest to smallest Smaller values (i.e., higher ranks) across metrics indicate more BOLD dependence and less noise. """ d_table_rank = np.vstack([ n_comps - stats.rankdata(comptable['kappa']), n_comps - stats.rankdata(comptable['dice_FR2']), n_comps - stats.rankdata(comptable['signal-noise_t']), stats.rankdata(comptable['countnoise']), n_comps - stats.rankdata(comptable['countsigFR2'])]).T comptable['d_table_score'] = d_table_rank.mean(axis=1) """ Step 1: Reject anything that's obviously an artifact a. Estimate a null variance """ # Rho is higher than Kappa temp_rej0a = all_comps[(comptable['rho'] > comptable['kappa'])] comptable.loc[temp_rej0a, 'classification'] = 'rejected' comptable.loc[temp_rej0a, 'rationale'] += 'I002;' # Number of significant voxels for S0 model is higher than number for R2 # model *and* number for R2 model is greater than zero. 
temp_rej0b = all_comps[((comptable['countsigFS0'] > comptable['countsigFR2']) & (comptable['countsigFR2'] > 0))] comptable.loc[temp_rej0b, 'classification'] = 'rejected' comptable.loc[temp_rej0b, 'rationale'] += 'I003;' rej = np.union1d(temp_rej0a, temp_rej0b) # Dice score for S0 maps is higher than Dice score for R2 maps and variance # explained is higher than the median across components. temp_rej1 = all_comps[(comptable['dice_FS0'] > comptable['dice_FR2']) & (comptable['variance explained'] > np.median(comptable['variance explained']))] comptable.loc[temp_rej1, 'classification'] = 'rejected' comptable.loc[temp_rej1, 'rationale'] += 'I004;' rej = np.union1d(temp_rej1, rej) # T-value is less than zero (noise has higher F-statistics than signal in # map) and variance explained is higher than the median across components. temp_rej2 = unclf[(comptable.loc[unclf, 'signal-noise_t'] < 0) & (comptable.loc[unclf, 'variance explained'] > np.median(comptable['variance explained']))] comptable.loc[temp_rej2, 'classification'] = 'rejected' comptable.loc[temp_rej2, 'rationale'] += 'I005;' rej = np.union1d(temp_rej2, rej) unclf = np.setdiff1d(unclf, rej) """ Step 2: Make a guess for what the good components are, in order to estimate good component properties a. Not outlier variance b. Kappa>kappa_elbow c. Rho<Rho_elbow d. High R2* dice compared to S0 dice e. Gain of F_R2 in clusters vs noise f. Estimate a low and high variance """ # Step 2a # Upper limit for variance explained is median across components with high # Kappa values. High Kappa is defined as Kappa above Kappa elbow. varex_upper_p = np.median( comptable.loc[comptable['kappa'] > getelbow(comptable['kappa'], return_val=True), 'variance explained']) ncls = unclf.copy() # NOTE: We're not sure why this is done, nor why it's specifically done # three times. Need to look into this deeper, esp. to make sure the 3 # isn't a hard-coded reference to the number of echoes. # Reduce components to investigate as "good" to ones in which change in # variance explained is less than the limit defined above.... What? for i_loop in range(3): ncls = comptable.loc[ncls].loc[ comptable.loc[ ncls, 'variance explained'].diff() < varex_upper_p].index.values # Compute elbows from other elbows f05, _, f01 = utils.getfbounds(n_echos) kappas_nonsig = comptable.loc[comptable['kappa'] < f01, 'kappa'] # NOTE: Would an elbow from all Kappa values *ever* be lower than one from # a subset of lower values? kappa_elbow = np.min((getelbow(kappas_nonsig, return_val=True), getelbow(comptable['kappa'], return_val=True))) rho_elbow = np.mean((getelbow(comptable.loc[ncls, 'rho'], return_val=True), getelbow(comptable['rho'], return_val=True), f05)) # Provisionally accept components based on Kappa and Rho elbows acc_prov = ncls[(comptable.loc[ncls, 'kappa'] >= kappa_elbow) & (comptable.loc[ncls, 'rho'] < rho_elbow)] if len(acc_prov) == 0: LGR.warning('No BOLD-like components detected') ign = sorted(np.setdiff1d(all_comps, rej)) comptable.loc[ign, 'classification'] = 'ignored' comptable.loc[ign, 'rationale'] += 'I006;' # Move decision columns to end comptable = comptable[[c for c in comptable if c not in cols_at_end] + [c for c in cols_at_end if c in comptable]] comptable['rationale'] = comptable['rationale'].str.rstrip(';') return comptable # Calculate "rate" for kappa: kappa range divided by variance explained # range, for potentially accepted components # NOTE: What is the logic behind this? 
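    # (Worked example with made-up numbers: if accepted Kappas span 20-80 and
    # their variance explained spans 1-7%, then kappa_rate = 60 / 6 = 10; a
    # component with 5% variance explained and Kappa 25 gets a kappa ratio of
    # 10 * 5 / 25 = 2. Larger ratios flag components carrying more variance
    # than their Kappa "earns", which feeds the candartA rejection below.)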
kappa_rate = ((np.max(comptable.loc[acc_prov, 'kappa']) - np.min(comptable.loc[acc_prov, 'kappa'])) / (np.max(comptable.loc[acc_prov, 'variance explained']) - np.min(comptable.loc[acc_prov, 'variance explained']))) comptable['kappa ratio'] = kappa_rate * comptable['variance explained'] / comptable['kappa'] varex_lower = stats.scoreatpercentile( comptable.loc[acc_prov, 'variance explained'], LOW_PERC) varex_upper = stats.scoreatpercentile( comptable.loc[acc_prov, 'variance explained'], HIGH_PERC) """ Step 3: Get rid of midk components; i.e., those with higher than max decision score and high variance """ max_good_d_score = EXTEND_FACTOR * len(acc_prov) midk = unclf[(comptable.loc[unclf, 'd_table_score'] > max_good_d_score) & (comptable.loc[unclf, 'variance explained'] > EXTEND_FACTOR * varex_upper)] comptable.loc[midk, 'classification'] = 'rejected' comptable.loc[midk, 'rationale'] += 'I007;' unclf = np.setdiff1d(unclf, midk) acc_prov = np.setdiff1d(acc_prov, midk) """ Step 4: Find components to ignore """ # collect high variance unclassified components # and mix of high/low provisionally accepted high_varex = np.union1d( acc_prov, unclf[comptable.loc[unclf, 'variance explained'] > varex_lower]) # ignore low variance components ign = np.setdiff1d(unclf, high_varex) # but only if they have bad decision scores ign = np.setdiff1d( ign, ign[comptable.loc[ign, 'd_table_score'] < max_good_d_score]) # and low kappa ign = np.setdiff1d(ign, ign[comptable.loc[ign, 'kappa'] > kappa_elbow]) comptable.loc[ign, 'classification'] = 'ignored' comptable.loc[ign, 'rationale'] += 'I008;' unclf = np.setdiff1d(unclf, ign) """ Step 5: Scrub the set if there are components that haven't been rejected or ignored, but are still not listed in the provisionally accepted group. """ if len(unclf) > len(acc_prov): comptable['d_table_score_scrub'] = np.nan # Recompute the midk steps on the limited set to clean up the tail d_table_rank = np.vstack([ len(unclf) - stats.rankdata(comptable.loc[unclf, 'kappa']), len(unclf) - stats.rankdata(comptable.loc[unclf, 'dice_FR2']), len(unclf) - stats.rankdata(comptable.loc[unclf, 'signal-noise_t']), stats.rankdata(comptable.loc[unclf, 'countnoise']), len(unclf) - stats.rankdata(comptable.loc[unclf, 'countsigFR2'])]).T comptable.loc[unclf, 'd_table_score_scrub'] = d_table_rank.mean(1) num_acc_guess = int(np.mean([ np.sum((comptable.loc[unclf, 'kappa'] > kappa_elbow) & (comptable.loc[unclf, 'rho'] < rho_elbow)), np.sum(comptable.loc[unclf, 'kappa'] > kappa_elbow)])) # Rejection candidate based on artifact type A: candartA conservative_guess = num_acc_guess / RESTRICT_FACTOR candartA = np.intersect1d( unclf[comptable.loc[unclf, 'd_table_score_scrub'] > conservative_guess], unclf[comptable.loc[unclf, 'kappa ratio'] > EXTEND_FACTOR * 2]) candartA = (candartA[comptable.loc[candartA, 'variance explained'] > varex_upper * EXTEND_FACTOR]) comptable.loc[candartA, 'classification'] = 'rejected' comptable.loc[candartA, 'rationale'] += 'I009;' midk = np.union1d(midk, candartA) unclf = np.setdiff1d(unclf, midk) # Rejection candidate based on artifact type B: candartB conservative_guess2 = num_acc_guess * HIGH_PERC / 100. 
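        # (E.g., with num_acc_guess = 20 and HIGH_PERC = 90,
        # conservative_guess2 is 18: components are rejected as type-B
        # artifacts only when their scrubbed decision-table score exceeds 18
        # *and* their variance explained exceeds EXTEND_FACTOR * varex_lower.)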
candartB = unclf[comptable.loc[unclf, 'd_table_score_scrub'] > conservative_guess2] candartB = (candartB[comptable.loc[candartB, 'variance explained'] > varex_lower * EXTEND_FACTOR]) comptable.loc[candartB, 'classification'] = 'rejected' comptable.loc[candartB, 'rationale'] += 'I010;' midk = np.union1d(midk, candartB) unclf = np.setdiff1d(unclf, midk) # Find components to ignore # Ignore high variance explained, poor decision tree scored components new_varex_lower = stats.scoreatpercentile( comptable.loc[unclf[:num_acc_guess], 'variance explained'], LOW_PERC) candart = unclf[comptable.loc[unclf, 'd_table_score_scrub'] > num_acc_guess] ign_add0 = candart[comptable.loc[candart, 'variance explained'] > new_varex_lower] ign_add0 = np.setdiff1d(ign_add0, midk) comptable.loc[ign_add0, 'classification'] = 'ignored' comptable.loc[ign_add0, 'rationale'] += 'I011;' ign = np.union1d(ign, ign_add0) unclf = np.setdiff1d(unclf, ign) # Ignore low Kappa, high variance explained components ign_add1 = np.intersect1d( unclf[comptable.loc[unclf, 'kappa'] <= kappa_elbow], unclf[comptable.loc[unclf, 'variance explained'] > new_varex_lower]) ign_add1 = np.setdiff1d(ign_add1, midk) comptable.loc[ign_add1, 'classification'] = 'ignored' comptable.loc[ign_add1, 'rationale'] += 'I012;' # at this point, unclf is equivalent to accepted # Move decision columns to end comptable = comptable[[c for c in comptable if c not in cols_at_end] + [c for c in cols_at_end if c in comptable]] comptable['rationale'] = comptable['rationale'].str.rstrip(';') return comptable
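

# ----------------------------------------------------------------------
# Hedged illustration (not part of the pipeline): how the classification
# and rationale columns behave in the decision tree above. The toy
# comptable and the single criterion here are examples only; rationale
# codes accumulate as semicolon-separated strings and the trailing
# semicolon is stripped before returning.
def _sketch_rationale_codes():
    import numpy as np
    import pandas as pd

    comptable = pd.DataFrame({'kappa': [80., 20.], 'rho': [10., 60.]})
    comptable['classification'] = 'accepted'
    comptable['rationale'] = ''
    # reject components whose Rho exceeds their Kappa (mirrors code I002)
    bad = comptable.index[comptable['rho'] > comptable['kappa']]
    comptable.loc[bad, 'classification'] = 'rejected'
    comptable.loc[bad, 'rationale'] += 'I002;'
    comptable['rationale'] = comptable['rationale'].str.rstrip(';')
    return comptable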