def run_constrained_clustering_by_fov_experiment(
        savepath, gc, whoistruth, evalset='U-control'):
    """Get medoids and vis constraining on a couple of fovs."""
    # one anchors database per clustering-constraint setting
    dbcon = {
        constrained: _connect_to_anchor_db(
            opj(savepath, '..'), constrained=constrained)
        for constrained in (True, False)
    }

    savedir = opj(savepath, f"{evalset}_{whoistruth}_AreTruth")
    _maybe_mkdir(savedir)

    # restrict visualization to fovs where the constraint made a difference
    anchor_counts = _get_fov_anchor_counts(
        dbcon=dbcon, evalset=evalset, whoistruth=whoistruth)
    differs = anchor_counts.loc[
        :, f'diff_mean_n_matches_{whoistruth}'] > 0
    fovs_to_vis = list(anchor_counts.loc[differs, :].index)

    for fovname in fovs_to_vis:
        print(f"visualizing {fovname}")

        # plot effect of clustering constraint
        plot_effect_of_iouthresh_and_constraint(
            gc=gc,
            dbcon=dbcon,
            fovname=fovname,
            whoistruth=whoistruth,
            evalset=evalset,
            savename=opj(savedir, f'constraintEffect_{fovname}.png'),
        )
def plot_krippendorph_summary(savepath, clsgroup):
    """Plot Krippendorph-alpha summary figures for one class grouping.

    Reads the "Krippendorph_byAnchorSubsets" table (rows matching
    ``clsgroup``) from the anchors database, then emits one figure per
    combination of evalset / unbiased flag / truth group / rater group /
    anchor subset into ``i10_Krippendorph/plots_{clsgroup}``.

    Parameters
    ----------
    savepath : str
        path whose parent holds the anchors sqlite database
    clsgroup : str
        class grouping to plot (e.g. 'main' or 'super')
    """
    # connect to database
    dbcon = _connect_to_anchor_db(opj(savepath, '..'))

    # get krippendorph summary table
    krippendorph_summary = read_sql_query(
        f"""
        SELECT * FROM "Krippendorph_byAnchorSubsets"
        WHERE "class_grouping" = "{clsgroup}"
        ;""", dbcon)

    # now plot
    savedir = opj(savepath, '..', 'i10_Krippendorph', f'plots_{clsgroup}')
    _maybe_mkdir(savedir)

    # explicit loops (not a throwaway `_ = [...]` comprehension) because
    # plotting is done purely for its side effect; iteration order matches
    # the original comprehension clause order
    for evalset in ir.MAIN_EVALSET_NAMES:
        for unbiased_is_truth in [True, False]:
            for whoistruth in ir.CONSENSUS_WHOS:
                for who in ir.CONSENSUS_WHOS:
                    for whichanchors in ['v2.1_consensus', 'v2.2_excluded']:
                        plot_krippendorph_figure(
                            savedir=savedir,
                            krippendorph_summary=krippendorph_summary,
                            unbiased_is_truth=unbiased_is_truth,
                            evalset=evalset,
                            whoistruth=whoistruth,
                            who=who,
                            whichanchors=whichanchors,
                        )
def main():
    """Compute pathologist confusion matrices and accuracy statistics."""
    DATASETNAME = 'CURATED_v1_2020-03-29_EVAL'

    # where to save stuff
    BASEPATH = "/home/mtageld/Desktop/cTME/results/tcga-nucleus/interrater/"
    SAVEPATH = opj(BASEPATH, DATASETNAME, 'i1_anchors')

    # connect to database
    dbcon = _connect_to_anchor_db(opj(SAVEPATH, '..'))

    for clsgroup in ('main', 'super'):
        # only the first combination per class grouping wipes old tables
        delete_existing = True
        for unbiased_is_truth in (False,):
            for whoistruth in ('Ps',):  # Interrater.CONSENSUS_WHOS:
                common_params = dict(
                    dbcon=dbcon,
                    whoistruth=whoistruth,
                    unbiased_is_truth=unbiased_is_truth,
                    clsgroup=clsgroup,
                    delete_existing=delete_existing,
                )
                delete_existing = False

                # get confusion matrices by pathologist
                get_confusion_by_pathologist(**common_params)

                # get confusion matrices by anchor
                get_confusion_by_anchor(**common_params)

                # get accuracy stats
                get_pathologist_accuracy_stats(**common_params)
def main():
    """Plot inter-rater concordance figures (kappa matrix and MDS)."""
    DATASETNAME = 'CURATED_v1_2020-03-29_EVAL'

    # where to save stuff
    BASEPATH = "/home/mtageld/Desktop/cTME/results/tcga-nucleus/interrater/"
    savedir = opj(BASEPATH, DATASETNAME, 'i9_InterRaterStats')
    _maybe_mkdir(savedir)

    # connect to sqlite database -- anchors
    dbcon = _connect_to_anchor_db(opj(savedir, '..'))

    # plot kappa matrix and MDS plot, one subfolder per class grouping
    for clg in ('main', 'super'):
        where = opj(savedir, clg)
        _maybe_mkdir(where)

        # compare various evalsets in terms of inter-rater concordance
        plot_interrater_boxplots(dbcon=dbcon, where=where, clsgroup=clg)

        for evalset in ir.MAIN_EVALSET_NAMES:
            plot_interrater_pairs(
                dbcon=dbcon, where=where, evalset=evalset, clsgroup=clg)
def save_krippendorph_summary(savepath, clsgroup):
    """Compute and persist the Krippendorph-alpha summary table.

    Iterates over evalsets and (unbiased, whoistruth, who, whichanchors)
    combinations, skipping meaningless ones, and appends each summary to
    the "Krippendorph_byAnchorSubsets" table in the anchors database.

    Parameters
    ----------
    savepath : str
        path whose parent holds the anchors sqlite database
    clsgroup : str
        one of 'raw', 'main', 'super'
    """
    assert clsgroup in ['raw', 'main', 'super']

    # connect to database
    dbcon = _connect_to_anchor_db(opj(savepath, '..'))

    # get and save krippendorph summary table
    for evalset in ['E', 'U-control']:
        # unbiased Ps, by definition, is U-control
        ubt = [False]
        if evalset != 'U-control':
            ubt.append(True)

        for unbiased_is_truth in ubt:
            for whoistruth in Interrater.CONSENSUS_WHOS:
                # unbiased NPs is not an interesting question
                if unbiased_is_truth and whoistruth == 'NPs':
                    continue
                for who in Interrater.CONSENSUS_WHOS:
                    # mixing Ps and NPs is meaningless
                    if who == 'All':
                        continue
                    # Ps compared to "truth" from NPs is not meaningful
                    if whoistruth == 'NPs' and who == 'Ps':
                        continue

                    # We only care about excluded anchors for main classes
                    # and for when Ps are truth, just to demonstrate that
                    # exclusion gets rid of bogus anchors
                    anchor_types = ['v2.1_consensus']
                    if (clsgroup == 'main') \
                            and (not unbiased_is_truth) \
                            and (whoistruth == 'Ps'):
                        anchor_types.append('v2.2_excluded')

                    for whichanchors in anchor_types:
                        summary = \
                            _get_krippendorph_summary_by_detection_ease(
                                dbcon, clsgroup=clsgroup,
                                unbiased_is_truth=unbiased_is_truth,
                                evalset=evalset, whoistruth=whoistruth,
                                who=who, whichanchors=whichanchors)
                        # fix: table name was an f-string without any
                        # placeholders (F541); plain literal is identical
                        summary.to_sql(
                            name='Krippendorph_byAnchorSubsets',
                            con=dbcon, if_exists='append', index=False)
def main():
    """Plot participant interrater accuracy statistics."""
    DATASETNAME = 'CURATED_v1_2020-03-29_EVAL'

    # where to save stuff
    BASEPATH = "/home/mtageld/Desktop/cTME/results/tcga-nucleus/interrater/"
    SAVEDIR = opj(BASEPATH, DATASETNAME, 'i5_ParticipantAccuracy')
    _maybe_mkdir(SAVEDIR)

    # connect to sqlite database -- anchors
    dbcon = _connect_to_anchor_db(opj(SAVEDIR, '..'))

    # Go through various evaluation sets & participant groups
    for clsgroup in ('main', 'super'):
        savedir = opj(SAVEDIR, clsgroup)
        _maybe_mkdir(savedir)

        for whoistruth in ('Ps',):
            for unbiased_is_truth in (False,):
                ubstr = "UNBIASED_" if unbiased_is_truth else ""
                print(f'{clsgroup.upper()}: {ubstr}{whoistruth}_AreTruth')

                shared = dict(
                    dbcon=dbcon,
                    savedir=savedir,
                    unbiased_is_truth=unbiased_is_truth,
                    whoistruth=whoistruth,
                    clsgroup=clsgroup,
                )

                # accuracy stats for a single evalset
                for evalset in ir.MAIN_EVALSET_NAMES:
                    plot_participant_accuracy_stats(
                        evalset=evalset, **shared)

                # compare accuracy stats for various evalsets
                plot_participant_accuracy_stats_v2(**shared)

                # superimpose AUROC for various evalsets
                if whoistruth == 'Ps':
                    plot_participant_accuracy_stats_v3(**shared)
def main():
    """Plot intra-rater statistics (same participant across evalsets)."""
    DATASETNAME = 'CURATED_v1_2020-03-29_EVAL'

    # where to save stuff
    BASEPATH = "/home/mtageld/Desktop/cTME/results/tcga-nucleus/interrater/"
    SAVEDIR = opj(BASEPATH, DATASETNAME, 'i8_IntraRaterStats')
    _maybe_mkdir(SAVEDIR)

    # connect to sqlite database -- anchors
    dbcon = _connect_to_anchor_db(opj(SAVEDIR, '..'))

    # compare same participant on various evalsets
    # NOTE(review): the per-clsgroup subfolder is not created here;
    # presumably plot_intrarater_stats makes it -- confirm
    for clg in ('main', 'super'):
        plot_intrarater_stats(
            dbcon=dbcon, savedir=opj(SAVEDIR, clg), clsgroup=clg)
def main():
    """Run the NP-accuracy simulation experiment."""
    DATASETNAME = 'CURATED_v1_2020-03-29_EVAL'

    # where to save stuff
    BASEPATH = "/home/mtageld/Desktop/cTME/results/tcga-nucleus/interrater/"
    SAVEPATH = opj(BASEPATH, DATASETNAME, 'i1_anchors')

    # connect to database
    dbcon = _connect_to_anchor_db(opj(SAVEPATH, '..'))

    # run the experiment for various evalsets
    simulations(
        dbcon=dbcon,
        nsims=1000,
        min_ps_per_fov=16,
        max_sim_ps_per_fov=15,
    )
def main():
    """Plot algorithmic-segmentation accuracy statistics."""
    DATASETNAME = 'CURATED_v1_2020-03-29_EVAL'

    # where to save stuff
    BASEPATH = "/home/mtageld/Desktop/cTME/results/tcga-nucleus/interrater/"
    savedir = opj(BASEPATH, DATASETNAME, 'i6_SegmentationAccuracy')
    _maybe_mkdir(savedir)

    # connect to sqlite database -- anchors
    dbcon = _connect_to_anchor_db(opj(savedir, '..'))

    # Go through various evaluation sets & participant groups
    unbiased_is_truth = False
    ubstr = "UNBIASED_" if unbiased_is_truth else ""  # loop-invariant
    for whoistruth in ir.CONSENSUS_WHOS:
        print(f'{ubstr}{whoistruth}_AreTruth')

        # plot proportion of anchors that were agreed upon (by Ps) as
        # correctly segmented by the algorithm.
        # NOTE: Since the anchors here are paired, and the legend shows
        # the no of FOVs per anchor, this by definition uses the unbiased
        # control as a reference.
        plot_proportion_segmented(
            dbcon=dbcon, savedir=savedir, whoistruth=whoistruth)

        # compare accuracy stats for evalsets (coupled)
        plot_segmentation_accuracy_stats_v1(
            dbcon=dbcon, savedir=savedir,
            unbiased_is_truth=unbiased_is_truth, whoistruth=whoistruth)

        # compare accuracy stats for evalsets (independent)
        plot_segmentation_accuracy_stats_v2(
            dbcon=dbcon, savedir=savedir,
            unbiased_is_truth=unbiased_is_truth, whoistruth=whoistruth)
def main():
    """Plot NP-accuracy simulation statistics."""
    DATASETNAME = 'CURATED_v1_2020-03-29_EVAL'

    # where to save stuff
    BASEPATH = "/home/mtageld/Desktop/cTME/results/tcga-nucleus/interrater/"
    SAVEDIR = opj(BASEPATH, DATASETNAME, 'i11_NPsAccuracySimulations')
    _maybe_mkdir(SAVEDIR)

    # connect to sqlite database -- anchors
    dbcon = _connect_to_anchor_db(opj(SAVEDIR, '..'))

    # Go through various evaluation sets & participant groups
    for evalset in ('E',):
        for clsgroup in ('super',):
            savedir = opj(SAVEDIR, clsgroup)
            _maybe_mkdir(savedir)

            plot_simulation_stats(
                dbcon=dbcon, savedir=savedir,
                evalset=evalset, clsgroup=clsgroup)
def main():
    """Plot participant confusion matrices for various truth settings."""
    DATASETNAME = 'CURATED_v1_2020-03-29_EVAL'

    # where to save stuff
    BASEPATH = "/home/mtageld/Desktop/cTME/results/tcga-nucleus/interrater/"
    # NOTE(review): 'Paricipant' looks like a typo, but changing it would
    # change the output folder path -- kept byte-identical
    SAVEDIR = opj(BASEPATH, DATASETNAME, 'i7_ParicipantConfusions')
    _maybe_mkdir(SAVEDIR)

    # connect to sqlite database -- anchors
    dbcon = _connect_to_anchor_db(opj(SAVEDIR, '..'))

    # Go through various evaluation sets & participant groups
    for clsgroup in ('main', 'super'):
        savedir = opj(SAVEDIR, clsgroup)
        _maybe_mkdir(savedir)

        for whoistruth in ('Ps',):  # ir.CONSENSUS_WHOS:
            for unbiased_is_truth in (False,):  # [True, False]
                for who in ir.CONSENSUS_WHOS:
                    # NPs "truth" judging Ps is skipped
                    if (whoistruth == 'NPs') and (who == 'Ps'):
                        continue
                    for evalset in ('E', 'U-control'):  # ir.MAIN_EVALSET_NAMES
                        ubstr = "UNBIASED_" if unbiased_is_truth else ""
                        print(
                            f'{clsgroup.upper()}: '
                            f'{ubstr}{whoistruth}_AreTruth: {who}: {evalset}')

                        # compare accuracy stats for various evalsets
                        plot_participant_confusions(
                            dbcon=dbcon,
                            savedir=savedir,
                            unbiased_is_truth=unbiased_is_truth,
                            whoistruth=whoistruth,
                            who=who,
                            evalset=evalset,
                            clsgroup=clsgroup,
                        )
def get_and_plot_detection_and_classification_tally(
        savedir: str, unbiased_is_truth: bool, whoistruth: str,
        who: str, evalset: str):
    """Get a tally of detection and classification.

    For example, a tally dataframe for tumor nuclei, having a value of 43
    at row 3, column 5 means that there are 43 tumor nuclei (i.e. their
    REAL label is 'tumor') that were detected by 5 people, but only 3 of
    these people called it 'tumor'.
    """
    ubprefix = "UNBIASED_" if unbiased_is_truth else ""
    truthstr = f'{ubprefix}{whoistruth}_AreTruth'
    where = opj(savedir, truthstr)
    _maybe_mkdir(where)
    _maybe_mkdir(opj(where, 'csv'))
    _maybe_mkdir(opj(where, 'plots'))

    # connect to sqlite database -- anchors
    dbcon_anchors = _connect_to_anchor_db(opj(savedir, '..'))

    # get combined tally of detection and classification
    tallydfs = _get_detection_and_classification_tally(
        dbcon_anchors=dbcon_anchors,
        unbiased_is_truth=unbiased_is_truth,
        whoistruth=whoistruth,
        evalset=evalset,
        who=who)

    # save csvs, one per class
    prepend = f'{Interrater.TRUTHMETHOD}_{evalset}_{who}_{truthstr}'
    for cls, tallydf in tallydfs.items():
        csvname = f'{prepend}_{cls}_detection_and_classification_tally.csv'
        tallydf.to_csv(opj(where, 'csv', csvname))

    # now plot
    vis_detection_and_classification_tally(
        tallydfs=tallydfs,
        savename=opj(
            where, 'plots',
            f'{prepend}_detection_and_classification_tally.svg'),
    )
def main():
    """Parse anchor datasets for each inferred-truth setting."""
    DATASETNAME = 'CURATED_v1_2020-03-29_EVAL'

    # where to save stuff
    BASEPATH = "/home/mtageld/Desktop/cTME/results/tcga-nucleus/interrater/"
    savedir = opj(BASEPATH, DATASETNAME, 'i1_anchors', 'DATASET')
    # savedir = opj(BASEPATH, DATASETNAME, 'i1_anchors', 'TMP')
    _maybe_mkdir(savedir)

    # connect to sqlite database -- anchors
    dbcon = _connect_to_anchor_db(opj(savedir, '..', '..'))

    # to get FOV RGBs and visualize cluster medoids etc
    gc = CandygramAPI.connect_to_candygram()

    # Create datasets using different inferred truths
    for whoistruth in ir.CONSENSUS_WHOS:
        for evalset in ('E', 'U-control'):
            parse_anchors_dataset(
                dbcon=dbcon, gc=gc, savedir=savedir,
                whoistruth=whoistruth, evalset=evalset)
def main():
    """Build the anchors databases from the raw annotations.

    Per clustering-constraint setting: extract nucleus anchors (with
    pathologists as truth), add EM-inferred labels, add unbiased labels,
    then create convenience table views.
    """
    # Where are the masks, contours, etc
    DATASETNAME = 'CURATED_v1_2020-03-29_EVAL'
    DATASETPATH = "/home/mtageld/Desktop/cTME/data/tcga-nucleus/"
    DATASETPATH = opj(DATASETPATH, DATASETNAME)

    # where to save stuff
    SAVEPATH = "/home/mtageld/Desktop/cTME/results/tcga-nucleus/interrater/"
    SAVEPATH = opj(SAVEPATH, DATASETNAME)
    _maybe_mkdir(SAVEPATH)
    _maybe_mkdir(opj(SAVEPATH, 'i1_anchors'))

    # get + save everyone's alias
    alias = ir.PARTICIPANT_ALIASES
    aliasdf = DataFrame.from_dict(alias, orient='index')
    aliasdf.to_csv(opj(SAVEPATH, 'i1_anchors', 'participant_aliases.csv'))

    # connect to sqlite database -- annotations
    db_path = opj(DATASETPATH, DATASETNAME + ".sqlite")
    sql_engine = create_engine('sqlite:///' + db_path, echo=False)
    dbcon_annots = sql_engine.connect()

    # to get FOV RGBs and visualize cluster medoids etc
    gc = CandygramAPI.connect_to_candygram()

    # resolution settings used when fetching FOVs (MAG None -> MPP used)
    MPP = 0.2
    MAG = None

    # get information per evaluation set, user, and fov
    fovinfos = get_fovinfos_for_interrater(dbcon=dbcon_annots)
    with open(opj(SAVEPATH, 'i1_anchors', "fovinfos.json"), 'w') as f:
        json.dump(fovinfos, f, indent=4)

    # -------------------------------------------------------------------------
    # run once with and once without the clustering constraint; each run
    # writes into its own anchors database
    for constrained in [True, False]:

        # connect to sqlite database -- anchors
        dbcon = _connect_to_anchor_db(SAVEPATH, constrained=constrained)

        # Get nucleus anchors, using pathologists (SP/JP) as truth
        # but also get the false anchors
        gana_kwargs = {
            'fovinfos': fovinfos,
            'get_medoids_kwargs': {
                'dbcon': dbcon_annots,  # annotations
                'who': 'All',
                'add_relative_bounds': True,
                'gc': gc,
                'MPP': MPP,
                'MAG': MAG,
                'constrained': constrained,
            },
            'dbcon': dbcon,  # anchors
            # 'min_ious': np.arange(0.125, 0.76, 0.125),
            'min_ious': [0.25, 0.5, 0.75],
            'fovs_to_use': None,
            'constrained': constrained,
        }
        get_all_nucleus_anchors_gtruth(**gana_kwargs)

        # Add Expectation-Maximization inferred labels
        add_all_EM_inferred_labels(dbcon=dbcon)

        # Add unbiased labels to all the eval sets
        add_unbiased_labels_to_db(dbcon=dbcon)

        # create convenience virtual tables
        create_convenience_table_views(dbcon=dbcon)
def roc_pvals(clsgroup, ntrials=1000, unbiased=False):
    """Accuracy of inferred truth from NPs with/out algorithmic suggestions.

    This gets the bootstrap 95% confidence interval and p-values.

    Parameters
    ----------
    clsgroup : str
        class grouping (e.g. 'main' or 'super')
    ntrials : int
        number of bootstrap resampling trials
    unbiased : bool
        whether the unbiased inferred Ps labels are used as truth
    """
    # fix: removed stray unbalanced ')' from the progress message
    print(f"\n> [GO GET COFFEE ...] Getting roc_pvals for {clsgroup.upper()}")

    # connect to sqlite database -- anchors
    dbcon = _connect_to_anchor_db(rpath)

    # first we read all anchors
    # fix: ubstr was recomputed identically inside the loop; compute once
    ubstr = ir._ubstr(unbiased)
    truthcol = f'{ubstr}EM_inferred_label_Ps'
    anchors = {}
    for evalset in ir.MAIN_EVALSET_NAMES:
        # read real anchors and remap labels
        tablename = f'v3.1_final_anchors_{evalset}_{ubstr}Ps_AreTruth'
        anchs = read_sql_query(
            f"""
            SELECT * FROM "{tablename}"
            ;""", dbcon)
        anchs = remap_classes_in_anchorsdf(
            anchors=anchs, clsgroup=clsgroup,
            also_ilabel=True, remove_ambiguous=True,
            who_determines_ambig='Ps', how_ambig_is_determined='EM',
        )
        anchs.loc[:, 'ilabel'] = anchs.loc[:, 'EM_inferred_label_NPs']
        anchors[evalset] = anchs

    # get bootstrap roc aucs
    cats = ['micro', 'macro']
    roc_aucs = {
        cat: {evs: [] for evs in ir.MAIN_EVALSET_NAMES} for cat in cats
    }
    for _ in range(ntrials):
        for evalset in ir.MAIN_EVALSET_NAMES:
            x = anchors[evalset]
            # bootstrap: resample rows with replacement
            idxs = np.random.randint(x.shape[0], size=x.shape[0])
            _, _, rocauc = get_roc_and_auroc_for_who(
                anchors=x.iloc[idxs, :], truthcol=truthcol,
                probcol_prefix='EM_prob_', probcol_postfix='_NPs',
            )
            for cat in cats:
                roc_aucs[cat][evalset].append(rocauc[cat])

    # now get p-values
    pvals = {}
    for ev1, ev2 in combinations(ir.MAIN_EVALSET_NAMES, 2):
        for cat in cats:
            _, pvals[f'{ev1}_VS_{ev2}_{cat}'] = mannwhitneyu(
                roc_aucs[cat][ev1], roc_aucs[cat][ev2],
                alternative='two-sided')

    res = ""
    if clsgroup == 'main':
        res += "\n**********************************************************************"  # noqa
        res += "\ni5_ParticipantAccuracy -> Ps_AreTruth_superimposed_auroc_curves.svg\n"  # noqa
        res += "\nAccuracy of inferred truth from NPs with/out algorithmic suggestions.\n"  # noqa
        res += f"This is the bootstrap AUROC comparison p-value with {ntrials} trials.\n"  # noqa
    res += f'\n> AUROCs ({clsgroup.upper()}): '
    res += '----------------------------\n'
    for cat, aucvals_dict in roc_aucs.items():
        for ev, aucvals in aucvals_dict.items():
            # median AUROC with (5th, 95th) percentile bootstrap interval
            res += (f"{cat}: {ev}: {np.round(np.percentile(aucvals, 50), 3)} "
                    f"({np.round(np.percentile(aucvals, 5), 3)}, "
                    f"{np.round(np.percentile(aucvals, 95), 3)})\n")
    res += f'\n> pvals_intrarater ({clsgroup.upper()}, MANNWHITNEYU): '
    res += '----------------------------\n'
    for k, v in pvals.items():
        # fix: pure f-string formatting instead of mixing an f-string
        # with %-interpolation ("%.3f" % v); output is identical
        res += f"{k.replace('_', ' ')}: {v:.3f}\n"
    print(res)
    with open(rfile, 'a') as f:
        f.write(res)
def get_and_plot_all_summary_counts(savedir: str, unbiased_is_truth: bool,
                                    whoistruth: str, who: str, evalset: str,
                                    clsgroup: str):
    """Tally and plot summary counts of anchor detection/classification.

    Writes the cumulative counts, detection-composition, and inferred-label
    breakdown tables as csvs, then plots a combined summary figure.
    """
    assert clsgroup in ['raw', 'main', 'super']

    ubprefix = "UNBIASED_" if unbiased_is_truth else ""
    truthstr = f'{ubprefix}{whoistruth}_AreTruth'
    where = opj(savedir, truthstr)
    _maybe_mkdir(where)
    _maybe_mkdir(opj(where, 'csv'))
    _maybe_mkdir(opj(where, 'plots'))

    clmap, class_list = _get_clmap(clsgroup)
    class_list.remove('AMBIGUOUS')
    clmap['undetected'] = 'undetected'
    clmap['DidNotAnnotateFOV'] = 'DidNotAnnotateFOV'

    # connect to sqlite database -- anchors
    dbcon_anchors = _connect_to_anchor_db(opj(savedir, '..', '..'))

    # restrict to relevant FOV subset and anchors
    out = get_fovs_annotated_by_almost_everyone(
        dbcon_anchors=dbcon_anchors,
        unbiased_is_truth=unbiased_is_truth,
        whoistruth=whoistruth,
        evalset=evalset,
        who=who)

    # group classes as needed
    out['anchors'] = remap_classes_in_anchorsdf(
        anchors=out['anchors'], clsgroup=clsgroup)

    # tally of nuclei detected by AT LEAST 6 observers, etc
    cumulative_counts_table = get_summary_counts_table(
        anchors=out['anchors'], maxn=out['maxn'],
        unbiased_is_truth=unbiased_is_truth, whoistruth=whoistruth,
        who=who, class_list=class_list)
    detection_composition, inferred_label_breakdown = \
        _get_summary_percent_table(
            cumulative_counts_table, who=who, class_list=class_list)

    # save for reference
    prepend = f'{Interrater.TRUTHMETHOD}_{evalset}_{who}_{truthstr}'
    cumulative_counts_table.to_csv(
        opj(where, 'csv', f'{prepend}_counts_table.csv'))
    detection_composition.to_csv(
        opj(where, 'csv', f'{prepend}_detection_composition.csv'))
    inferred_label_breakdown.to_csv(
        opj(where, 'csv', f'{prepend}_inferred_label_breakdown.csv'))

    # now plot (callee keyword is capitalized; keep it as-is)
    _plot_counts_summaries(
        cumulative_counts_table=cumulative_counts_table,
        detection_composition=detection_composition,
        Inferred_label_breakdown=inferred_label_breakdown,
        who=who,
        class_list=class_list,
        savename=opj(where, 'plots', f'{prepend}_count_summaries.svg'),
    )