def from_case_dataset(path):
    dataset = io_functions.load_case_collection(path)
    metadata = pd.DataFrame([
        {"label": c.id, "group": c.group} for c in dataset
    ])
    return metadata
def transform(
        data: utils.URLPath,
        meta: utils.URLPath,
        output: utils.URLPath,
        reference: utils.URLPath,
        transargs: json.loads = None,
        sample: int = 0):
    """Transform dataset using a reference SOM.

    Args:
        data: Path to fcs data.
        meta: Path to dataset metadata.
        output: Path to output transformed SOMs.
        reference: Path to reference SOM model.
        transargs: Optional options for transforming individual SOMs.
        sample: Number of samples to transform from each group, only useful
            for testing purposes.
    """
    dataset = io_functions.load_case_collection(data, meta)

    # randomly sample 'sample' number of cases from each group
    if sample:
        dataset = dataset.sample(sample)

    if transargs is None:
        transargs = DEFAULT_TRANSFORM_SOM_ARGS

    print(f"Loading reference from {reference}")
    model = io_functions.load_casesom(reference, **transargs)

    transform_dataset_to_som(model, dataset, output)
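A usage sketch for `transform` (not from the original scripts; paths and the sampling value are hypothetical placeholders):

def _example_transform_usage():
    transform(
        data=utils.URLPath("data/fcs"),
        meta=utils.URLPath("data/fcs.json.gz"),
        output=utils.URLPath("output/soms"),
        reference=utils.URLPath("output/reference"),
        sample=5,  # hypothetical: only 5 cases per group for a quick test run
    )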
def main():
    # dataset = io_functions.load_case_collection(
    #     utils.URLPath("/data/flowcat-data/mll-flowdata/decCLL-9F"),
    #     utils.URLPath("/data/flowcat-data/mll-flowdata/decCLL-9F.2019-10-29.meta/test.json.gz")
    # )
    dataset = io_functions.load_case_collection(
        utils.URLPath("/data/flowcat-data/paper-cytometry/unused-data"),
    )
    LOGGER.info("Anonymizing dataset: %s", dataset)

    OUTPUT = utils.URLPath("/data/flowcat-data/paper-cytometry-resubmit/unused_data_anonymized")
    data_dir = OUTPUT / "data"
    data_dir.mkdir()

    for case in dataset:
        # if case.id != "ffc59330acb49e6fcf5e679dbabcd01e56991345":
        #     continue
        for sample in case.samples:
            old_path = sample.complete_path
            new_path = data_dir / sample.path
            LOGGER.info("Saving %s sample to %s", case.id, new_path)
            new_path.parent.mkdir()
            anon_move(str(old_path), str(new_path))
def load_datasets(data_path):
    datasets = {}
    for d in filter(lambda d: d.is_dir(), data_path.iterdir()):
        datasets[d.name] = {
            "data": io_functions.load_case_collection(d, d + ".json"),
            "config": io_functions.load_json(d + "_config.json"),
        }
    return datasets
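The loader above implies that every dataset directory has sibling `<name>.json` metadata and `<name>_config.json` files; a sketch of the assumed layout (directory names hypothetical):

# data_path/
#     train/              <- per-case SOM data files
#     train.json          <- case collection metadata
#     train_config.json   <- channel/SOM configuration
#     test/
#     test.json
#     test_config.json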
def create_tsne(data: utils.URLPath, meta: utils.URLPath, plotdir: utils.URLPath):
    """Generate tsne plots for a subsample of data.

    Args:
        data: Path to generated soms for cases.
        meta: Path to metadata json for cases.
        plotdir: Path to output plots for data.
    """
    # data = flowcat.utils.URLPath("output/test-2019-08/som")
    # meta = flowcat.utils.URLPath("output/test-2019-08/som.json")
    # plotdir = flowcat.utils.URLPath("output/test-2019-08/tsne")
    cases = io_functions.load_case_collection(data, meta)
    # cases = cases.sample(20, flowcat.mappings.GROUPS)
    # flowcat.io_functions.save_json(cases.labels, plotdir / "case_ids.json")
    # labels = io_functions.load_json(plotdir / "case_ids.json")
    # cases = cases.filter(labels=labels)
    # print(cases)
    groups = np.array([case.group for case in cases])
    colors = {
        "CLL": "red",
        "MBL": "dodgerblue",
        "MCL": "steelblue",
        "PL": "skyblue",
        "LPL": "limegreen",
        "MZL": "forestgreen",
        "FL": "springgreen",
        "HCL": "orchid",
        "normal": "darkgoldenrod",
    }
    plotdir.mkdir()
    for tube in ("1", "2", "3"):
        soms = []
        for case in cases:
            sample = case.get_tube(tube, kind="som")
            sample.path = data / f"{case.id}_t{tube}.npy"
            som = sample.get_data().data.flatten()
            soms.append(som)
        somdata = np.array(soms)

        tsne = manifold.TSNE(n_components=2, perplexity=10)
        transformed = tsne.fit_transform(somdata)

        fig, ax = plt.subplots(figsize=(11, 7))
        for group in mappings.GROUPS:
            gdata = transformed[groups == group]
            gx = gdata[:, 0]
            gy = gdata[:, 1]
            ax.scatter(gx, gy, c=colors[group], label=group)
        plt.legend()
        plt.savefig(plotdir / f"tsne_{tube}.png")
        plt.close("all")
def dataset(data: utils.URLPath, meta: utils.URLPath):
    """Print information on the given dataset."""
    try:
        dataset = io_functions.load_case_collection(data, meta)
    except TypeError:
        dataset = io_functions.load_case_collection_from_caseinfo(data, meta)
    print(f"Loaded dataset from {meta}", dataset)
    print(dataset.group_count)
def main(
        data: utils.URLPath,
        meta: utils.URLPath,
        output: utils.URLPath,
        reference_ids: utils.URLPath = None,
        reference: utils.URLPath = None,
        tensorboard_dir: utils.URLPath = None,
        modelargs: json.loads = None,
        transargs: json.loads = None,
        mode: str = "fit_transform",
):
    """Train a SOM and use its weights to initialize individual SOM training.

    Args:
        data: Path to fcs data.
        meta: Path to dataset metadata, this should correctly reference fcs data.
        output: Path to output model and transformed cases.
        reference_ids: Optional path to a list of ids used for reference SOM generation.
        reference: Optional pretrained model.
        modelargs: Optional specific options for reference SOM generation.
        transargs: Optional specific options for transforming individual SOMs.
        mode: Whether to fit or to transform. Default is both.
    """
    dataset = io_functions.load_case_collection(data, meta)

    if reference is None:
        reference_ids = io_functions.load_json(reference_ids)
        reference_dataset = dataset.filter(labels=reference_ids)
        print("Training reference SOM on", reference_dataset)
        reference = train_model(reference_dataset, modelargs=modelargs)
        reference_output = output / "reference"
        io_functions.save_casesom(reference, reference_output)
        reference = reference_output

    if mode == "fit":
        return

    if transargs is None:
        transargs = {
            "max_epochs": 4,
            "batch_size": 50000,
            "initial_radius": 4,
            "end_radius": 1,
        }

    model = io_functions.load_casesom(
        reference, tensorboard_dir=tensorboard_dir, **transargs)

    som_output = output / "som"
    transform_cases(dataset, model, som_output)
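A two-stage usage sketch (hypothetical paths): fit only the reference SOM first, then transform the whole dataset with the saved reference in a second invocation:

def _example_two_stage_usage():
    main(
        data=utils.URLPath("data/fcs"),
        meta=utils.URLPath("data/fcs.json.gz"),
        output=utils.URLPath("output/run1"),
        reference_ids=utils.URLPath("output/reference_ids.json"),
        mode="fit",
    )
    main(
        data=utils.URLPath("data/fcs"),
        meta=utils.URLPath("data/fcs.json.gz"),
        output=utils.URLPath("output/run1"),
        reference=utils.URLPath("output/run1/reference"),
        mode="transform",
    )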
def main(
        fcsdata: utils.URLPath,
        fcsmeta: utils.URLPath,
        somdata: utils.URLPath,
        output: utils.URLPath,
):
    fcs_dataset = io_functions.load_case_collection(fcsdata, fcsmeta)
    try:
        som_config = io_functions.load_json(somdata + "_config.json")
    except FileNotFoundError:
        som_config = None

    if som_config is None:
        selected_markers = fcs_dataset.selected_markers
    else:
        selected_markers = {t: d["channels"] for t, d in som_config.items()}

    tubes = ("1", "2", "3")
    model = quantization_error_model()
    sess = tf.Session()
    results = []
    for fcscase in fcs_dataset:
        print(fcscase)
        for tube in tubes:
            fcssample = fcscase.get_tube(tube, kind="fcs").get_data()
            somsample = get_som_data(fcscase.id, tube, somdata, selected_markers[tube])
            error = sample_quantization_error(fcssample, somsample, model, sess)
            results.append((fcscase.id, tube, error))

    # aggregate per-tube statistics; divide by the per-tube result count, not
    # the total number of results across all tubes
    counts = {t: len([r for r in results if r[1] == t]) for t in tubes}
    stats = {}
    stats["mean"] = {
        t: sum(r[-1] for r in results if r[1] == t) / counts[t]
        for t in tubes
    }
    stats["variance"] = {
        t: sum(np.power(r[-1] - stats["mean"][t], 2) for r in results if r[1] == t) / counts[t]
        for t in tubes
    }
    print("Mean quantization error", stats)
    io_functions.save_json(results, output / "quantization_error.json")
    io_functions.save_json(stats, output / "quantization_error_mean.json")
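Quantization error here measures how faithfully a SOM summarizes its FCS sample. A minimal numpy sketch of the standard definition, the mean distance of each event to its best matching unit (an assumption about what `sample_quantization_error` computes, not its actual code):

def _quantization_error_sketch(events: np.ndarray, nodes: np.ndarray) -> float:
    # events: (n_events, n_channels), nodes: (n_nodes, n_channels)
    dists = np.linalg.norm(events[:, None, :] - nodes[None, :, :], axis=-1)
    # per-event distance to the best matching unit, averaged over all events
    return float(dists.min(axis=1).mean())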
def main(
        data: utils.URLPath = None,
        model: utils.URLPath = None,
        preds: utils.URLPath = None,
        output: utils.URLPath = None,
):
    data = utils.URLPath("/data/flowcat-data/paper-cytometry/som/unused")
    dataset = io_functions.load_case_collection(data, data + ".json.gz")

    # output = utils.URLPath("/data/flowcat-data/paper-cytometry/tsne")
    output = utils.URLPath("teststuff_unused_style")
    output.mkdir()

    # predictions = io_functions.load_json(utils.URLPath("/data/flowcat-data/paper-cytometry/tsne/prediction.json"))
    model = SOMClassifier.load(utils.URLPath("/data/flowcat-data/paper-cytometry/classifier"))

    som_tsne(dataset, model, output)
def main(args): """Load case ids from json file to filter cases and train and save the created model.""" output_dir = args.output dataset = io_functions.load_case_collection(args.data, args.meta) selected_labels = io_functions.load_json(args.cases) selected, _ = dataset.filter_reasons(labels=selected_labels) if args.tensorboard: tensorboard_dir = output_dir / "tensorboard" else: tensorboard_dir = None model = train_model(selected, markers=args.markers, tensorboard=tensorboard_dir, marker_name_only=args.marker_name_only) io_functions.save_casesom(model, output_dir)
def filter(
        data: utils.URLPath,
        filters: json.loads,
        output: utils.URLPath = None,
        meta: utils.URLPath = None,
        sample: int = 0,
        move_samples: bool = False,
):
    """Filter data on the given filters and output resulting dataset metadata
    to destination.

    Args:
        data: Path to fcs data.
        meta: Path to dataset metadata.
        output: Path to output for metadata.
        filters: Filters for individual cases.
        sample: Number of cases per group.
        move_samples: Destination will also include sample data.
    """
    print(f"Loading existing dataset from {data} with metadata in {meta}")
    try:
        dataset = io_functions.load_case_collection(data, meta)
    except TypeError:
        dataset = io_functions.load_case_collection_from_caseinfo(data, meta)

    dataset = dataset.filter(**filters)
    if sample:
        dataset = dataset.sample(sample)

    print(f"Filtering down to {dataset}")
    print(dataset.group_count)

    if output:
        print("Saving", dataset, f"to {output}")
        if move_samples:
            io_functions.save_case_collection_with_data(dataset, output)
        else:
            io_functions.save_case_collection(dataset, output)
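A usage sketch for `filter` (hypothetical paths; the `groups` filter key mirrors its use elsewhere in the codebase):

def _example_filter_usage():
    filter(
        data=utils.URLPath("data/fcs"),
        meta=utils.URLPath("data/fcs.json.gz"),
        output=utils.URLPath("output/filtered"),
        filters={"groups": ["CLL", "normal"]},
        sample=10,  # keep at most 10 cases per group
    )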
def main(args): """Load a model with given transforming arguments and transform individual cases.""" cases = io_functions.load_case_collection(args.data, args.meta) # cases = cases.sample(1, groups=["CLL", "normal"]) selected_markers = cases.selected_markers marker_name_only = False if args.tensorboard: tensorboard_dir = args.output / "tensorboard" else: tensorboard_dir = None # scaler = "RefitMinMaxScaler" scaler = args.scaler # Training parameters for the model can be respecified, the only difference # between transform and normal traninig, is that after a transformation is # completed, the original weights will be restored to the model. model = casesom.CaseSom( tubes=selected_markers, tensorboard_dir=tensorboard_dir, modelargs={ "marker_name_only": marker_name_only, "max_epochs": 5, "batch_size": 50000, "initial_radius": int(args.size / 2), "end_radius": 1, "radius_cooling": "linear", # "marker_images": sommodels.fcssom.MARKER_IMAGES_NAME_ONLY, "map_type": "toroid", "dims": (args.size, args.size, -1), "scaler": scaler, } ) transform_cases(cases, model, args.output)
def load_matching_som_dataset(
        fcs_dataset: "CaseCollection",
        som_dataset_path: utils.URLPath) -> "CaseCollection":
    """Check whether the given som path contains a complete SOM dataset
    matching the given FCS dataset. Otherwise return None.
    """
    try:
        som_dataset = io_functions.load_case_collection(som_dataset_path)
    except Exception as e:
        LOGGER.warning("Loading existing dataset at %s produced error: %s", som_dataset_path, e)
        return None

    same_case_number = len(fcs_dataset) == len(som_dataset)
    same_sample_count = len([1 for c in fcs_dataset for s in c.samples]) == len(
        [1 for c in som_dataset for s in c.samples])
    if not (same_case_number and same_sample_count):
        LOGGER.warning(
            "Existing som dataset at %s does not match number of samples or cases of given FCS dataset",
            som_dataset_path)
        return None
    return som_dataset
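A usage sketch of the caching pattern this check enables, reusing `transform_dataset_to_som` from the transform script above (the surrounding names are placeholders):

def _example_cached_som(model, fcs_dataset, som_path):
    som_dataset = load_matching_som_dataset(fcs_dataset, som_path)
    if som_dataset is None:
        # no usable cached SOMs: regenerate them and reload the metadata
        transform_dataset_to_som(model, fcs_dataset, som_path)
        som_dataset = io_functions.load_case_collection(som_path)
    return som_dataset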
def load_case_collection(data: str, meta: str = None):
    data = utils.URLPath(data)
    if meta is not None:
        meta = utils.URLPath(meta)
    return io_functions.load_case_collection(data, meta)
def main(data: utils.URLPath, reference: utils.URLPath, output: utils.URLPath):
    """Transform cases with a grid of radius settings using the given reference model."""
    cases = io_functions.load_case_collection(data, data / data.name + ".json")
    default_settings = {
        "max_epochs": 4,
        "initial_learning_rate": 0.05,
        "end_learning_rate": 0.01,
        "batch_size": 50000,
        "initial_radius": 4,
        "end_radius": 1,
    }
    # settings = [
    #     ("learning_rate_001_0001", {"initial_learning_rate": 0.01, "end_learning_rate": 0.001}),
    #     ("learning_rate_001_001", {"initial_learning_rate": 0.01, "end_learning_rate": 0.01}),
    #     ("learning_rate_005_0001", {"initial_learning_rate": 0.05, "end_learning_rate": 0.001}),
    #     ("learning_rate_005_001", {"initial_learning_rate": 0.05, "end_learning_rate": 0.01}),
    #     ("learning_rate_005_005", {"initial_learning_rate": 0.05, "end_learning_rate": 0.05}),
    #     ("learning_rate_05_0001", {"initial_learning_rate": 0.5, "end_learning_rate": 0.001}),
    #     ("learning_rate_05_001", {"initial_learning_rate": 0.5, "end_learning_rate": 0.01}),
    #     ("learning_rate_05_01", {"initial_learning_rate": 0.5, "end_learning_rate": 0.1}),
    #     ("learning_rate_05_05", {"initial_learning_rate": 0.5, "end_learning_rate": 0.5}),
    # ]
    settings = [
        ("radius_24_1", {"initial_radius": 24, "end_radius": 1}),
        ("radius_24_2", {"initial_radius": 24, "end_radius": 2}),
        ("radius_16_1", {"initial_radius": 16, "end_radius": 1}),
        ("radius_16_2", {"initial_radius": 16, "end_radius": 2}),
        ("radius_8_1", {"initial_radius": 8, "end_radius": 1}),
        ("radius_8_2", {"initial_radius": 8, "end_radius": 2}),
        ("radius_4_1", {"initial_radius": 4, "end_radius": 1}),
        ("radius_4_2", {"initial_radius": 4, "end_radius": 2}),
    ]
    for name, setting in settings:
        model = io_functions.load_casesom(
            reference,
            **{**default_settings, **setting},
        )
        transform_data(cases, model, output / name)
def lenient_load_collection(data, meta):
    try:
        dataset = io_functions.load_case_collection(data, meta)
    except TypeError:
        dataset = io_functions.load_case_collection_from_caseinfo(data, meta)
    return dataset
""" Acquire FCS information needed for Miflowcyt document. Also roughly check whether we have strongly diverging data in our dataset. """ from flowcat import dataset as fc_dataset, io_functions, utils import fcsparser def section(text, level=4, deco="#"): deco_text = deco * level section_text = f"{deco_text} {text} {deco_text}" print(section_text) train_dataset = io_functions.load_case_collection(utils.URLPath("/data/flowcat-data/mll-flowdata/decCLL-9F"), utils.URLPath("/data/flowcat-data/mll-flowdata/decCLL-9F.2019-10-29.meta/train.json.gz")) test_dataset = io_functions.load_case_collection(utils.URLPath("/data/flowcat-data/mll-flowdata/decCLL-9F"), utils.URLPath("/data/flowcat-data/mll-flowdata/decCLL-9F.2019-10-29.meta/test.json.gz")) print("Loading all data used in paper analysis.") dataset = train_dataset + test_dataset print(dataset) section("Get info for case 0") case = dataset[0] print(case) sample = case.samples[0] meta, data = fcsparser.parse(sample.complete_path) for i in range(1, 13): name = f"$P{i}S" voltage = f"$P{i}V"
def main(data: utils.URLPath, meta: utils.URLPath, output: utils.URLPath):
    """
    Args:
        data: Path to fcs dataset data
        meta: Path to fcs dataset metainformation
        output: Output path
    """
    tubes = ("1", "2")
    sample_size = 512

    # group_mapping = mappings.GROUP_MAPS["6class"]
    # mapping = group_mapping["map"]
    mapping = None
    groups = mappings.GROUPS
    # groups = group_mapping["groups"]

    dataset = io_functions.load_case_collection(data, meta)
    if mapping:
        dataset = dataset.map_groups(mapping)
    dataset = dataset.filter(groups=groups)

    validate, train = dataset.create_split(50)
    print(train.group_count)
    # train = train.balance(1000).shuffle()
    train = train.sample(100).shuffle()
    print(train.group_count)

    group_count = train.group_count
    group_weights = classification_utils.calculate_group_weights(group_count)
    group_weights = {
        i: group_weights.get(g, 1.0) for i, g in enumerate(groups)
    }

    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    binarizer = LabelBinarizer()
    binarizer.fit(groups)

    train_seq = FCSSequence(
        train, binarizer, tubes=tubes, sample_size=sample_size, batch_size=64)
    validate_seq = FCSSequence(
        validate, binarizer, tubes=tubes, sample_size=sample_size, batch_size=128)

    config = {
        "tubes": tubes,
        "groups": groups,
    }
    io_functions.save_json(config, output / "config.json")

    # for tube in tubes:
    #     x, y, z = selected_tubes[tube]["dims"]
    #     selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    # confusions between similar entities are discounted, misclassifying a
    # pathological case as normal is penalized more heavily
    cost_mapping = {
        ("CLL", "MBL"): 0.5,
        ("MBL", "CLL"): 0.5,
        ("MCL", "PL"): 0.5,
        ("PL", "MCL"): 0.5,
        ("LPL", "MZL"): 0.5,
        ("MZL", "LPL"): 0.5,
        ("CLL", "normal"): 2,
        ("MBL", "normal"): 2,
        ("MCL", "normal"): 2,
        ("PL", "normal"): 2,
        ("LPL", "normal"): 2,
        ("MZL", "normal"): 2,
        ("FL", "normal"): 2,
        ("HCL", "normal"): 2,
    }
    cost_matrix = classification_utils.build_cost_matrix(cost_mapping, groups)

    model = create_fcs_model(train_seq.xshape, train_seq.yshape, global_decay=5e-5)
    model.compile(
        # loss="categorical_crossentropy",
        # loss=keras.losses.CategoricalCrossentropy(),
        loss=classification_utils.WeightedCategoricalCrossentropy(cost_matrix),
        # loss="binary_crossentropy",
        optimizer="adam",
        # optimizer=optimizers.Adam(lr=0.0, decay=0.0, epsilon=epsilon),
        metrics=[
            "acc",
            # keras.metrics.CategoricalAccuracy(),
            # keras.metrics.TopKCategoricalAccuracy(k=2),
            # top2_acc,
        ])
    model.summary()

    tensorboard_dir = str(output / "tensorboard")
    tensorboard_callback = keras.callbacks.TensorBoard(
        log_dir=str(tensorboard_dir),
        histogram_freq=5,
        write_grads=True,
        write_images=True,
    )
    nan_callback = keras.callbacks.TerminateOnNaN()

    model.fit_generator(
        epochs=20,
        shuffle=True,
        callbacks=[
            # tensorboard_callback,
            nan_callback
        ],
        class_weight=group_weights,
        generator=train_seq,
        validation_data=validate_seq)
    model.save(str(output / "model.h5"))
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    preds = []
    for pred in model.predict_generator(validate_seq):
        preds.append(pred)
    pred_arr = np.array(preds)
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validate_seq.true_labels

    generate_all_metrics(
        true_labels, pred_labels, {"groups": groups, "map": {}}, output / "unmapped")
    for map_name, mapping in mappings.GROUP_MAPS.items():
        output_path = output / map_name
        print(f"--- MAPPING: {map_name} ---")
        # skip if the mapping contains more groups than were trained on
        if len(mapping["groups"]) > len(groups):
            continue
        generate_all_metrics(true_labels, pred_labels, mapping, output_path)
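The cost mapping above encodes domain knowledge as (true, predicted) pair weights. A sketch of how such a mapping could be expanded into a matrix (an assumption about `classification_utils.build_cost_matrix`, not its actual code; diagonal handling is a guess):

def _build_cost_matrix_sketch(cost_mapping, groups):
    size = len(groups)
    matrix = np.ones((size, size))  # default cost 1.0 for every (true, pred) pair
    index = {g: i for i, g in enumerate(groups)}
    for (true_group, pred_group), cost in cost_mapping.items():
        matrix[index[true_group], index[pred_group]] = cost
    return matrix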
def load_datasets():
    berlin_dataset = io_functions.load_case_collection(**BERLIN_DATA)
    munich_dataset = io_functions.load_case_collection(**MUNICH_DATA)
    return berlin_dataset, munich_dataset
# pylint: skip-file
# flake8: noqa
import numpy as np
import pandas as pd

from flowcat.utils import URLPath
from flowcat.io_functions import load_case_collection
from flowcat.dataset import case_dataset

datapath = URLPath("output/test-2019-08/som")
metapath = URLPath("output/test-2019-08/som.json")
cases = load_case_collection(datapath, metapath)
print(cases)

nppath = URLPath("output/test-2019-08/somnp2")
nppath.mkdir()

all_num = len(cases)
for i, case in enumerate(cases):
    print(f"Converting {i}/{all_num}")
    for somsample in case.samples:
        somsample.path = datapath / f"{case.id}_t{somsample.tube}.csv"
        somdata = pd.read_csv(str(somsample.path), index_col=0)
        somarray = somdata.values
        somarray = somarray.reshape((32, 32, -1))
        newpath = nppath / f"{case.id}_t{somsample.tube}.npy"
        np.save(str(newpath), somarray)
def main(data: utils.URLPath, model: utils.URLPath, output: utils.URLPath):
    dataset = io_functions.load_case_collection(data, data + ".json")
    dataset.set_data_path(utils.URLPath(""))
    model = SOMClassifier.load(model)
    validate = model.get_validation_data(dataset)
    val_seq = model.create_sequence(validate)
    trues = np.concatenate([val_seq[i][1] for i in range(len(val_seq))])
    preds = np.array([p for p in model.model.predict_generator(val_seq)])

    create_roc_results(trues, preds, output / "roc", model)
    create_threshold_results(trues, preds, output / "threshold", model)

    # tsne of result vectors
    embedding_path = output / "embedding-preds"
    embedding_path.mkdir()
    pred_labels = val_seq.true_labels
    groups = model.config["groups"]
    # move normal to the front of the group list
    groups.remove("normal")
    groups = ["normal", *groups]
    all_groups = groups + ["AML", "MM", "HCLv"]
    colors = sns.cubehelix_palette(len(all_groups), rot=4, dark=0.30)
    perplexity = 50

    # tsne of intermediate layers
    intermediate_model = keras.Model(
        inputs=model.model.input,
        outputs=model.model.get_layer("concatenate_1").output)
    intermed_preds = np.array(
        [p for p in intermediate_model.predict_generator(val_seq)])

    # unknown data
    udata = utils.URLPath("output/unknown-cohorts-processing/som/som")
    udataset = io_functions.load_case_collection(udata, udata + ".json")
    udataset.set_data_path(utils.URLPath(""))
    un_seq = model.create_sequence(udataset)
    intermed_upreds = np.array(
        [p for p in intermediate_model.predict_generator(un_seq)])

    all_intermed = np.concatenate((intermed_preds, intermed_upreds))
    all_labels = pred_labels + un_seq.true_labels

    umap_inter_all = UMAP(n_neighbors=30).fit_transform(all_intermed)
    plot_embedded(
        umap_inter_all, all_labels, all_groups, colors=colors).savefig(
            str(embedding_path / "umap_intermediate_all.png"), dpi=300)

    tsne_inter_all = manifold.TSNE(perplexity=perplexity).fit_transform(all_intermed)
    plot_embedded(
        tsne_inter_all, all_labels, all_groups, colors=colors).savefig(
            str(embedding_path / f"tsne_intermediate_all_p{perplexity}.png"), dpi=300)

    # create som tsne for known and unknown data
    all_cases = validate.cases + udataset.cases
    case_data = []
    for case in all_cases:
        somdata = np.concatenate([
            case.get_tube(tube, kind="som").get_data().data
            for tube in model.config["tubes"]
        ], axis=2).flatten()
        case_data.append(somdata)
    case_data = np.array(case_data)

    perplexity = 50
    umap_som_all = UMAP(n_neighbors=30).fit_transform(case_data)
    plot_embedded(umap_som_all, all_labels, all_groups, colors=colors).savefig(
        str(embedding_path / "umap_som_all.png"), dpi=300)

    tsne_som_all = manifold.TSNE(perplexity=perplexity).fit_transform(case_data)
    plot_embedded(tsne_som_all, all_labels, all_groups, colors=colors).savefig(
        str(embedding_path / f"tsne_som_all_p{perplexity}.png"), dpi=300)

    # plot legend
    fig = plt.figure()
    patches = [
        mpl.patches.Patch(color=color, label=group)
        for group, color in zip(all_groups, colors)
    ]
    fig.legend(patches, all_groups, loc='center', frameon=False)
    fig.savefig(str(embedding_path / "legend.png"), dpi=300)
def read_sel_markers(selected_markers) -> "Dict[str, List[Marker]]":
    """Read selected markers from a file and convert them to Marker objects."""
    markers = list(selected_markers.values())[0]
    marker_names = []
    for marker in markers:
        marker_names.append(
            Marker(antibody=Marker.name_to_marker(marker).antibody, color=None))
    selected_markers = {"1": marker_names}
    return selected_markers


dataset = io_functions.load_case_collection(
    utils.URLPath("/data/flowcat-data/2020_Nov_rerun/Merged_Files/MLL9F"),
    utils.URLPath("/data/flowcat-data/2020_Nov_rerun/Merged_Files/MLL9F_meta/train.json.gz"))
references = io_functions.load_json(
    utils.URLPath("/data/flowcat-data/2020_Nov_rerun/Merged_Files/MLL9F_meta/references.json"))
OUTPUT = utils.URLPath("/data/flowcat-data/2020_Nov_rerun/Merged_SOM/MLL9F")

setup_logging(None, "generate ref SOM for merged FCS")

ref_dataset = dataset.filter(labels=references)
tensorboard_dir = None
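A usage sketch for `read_sel_markers` with a hypothetical input (channel names borrowed from elsewhere in the codebase); the exact antibody parsing depends on `Marker.name_to_marker`:

example_markers = read_sel_markers({"1": ["CD45-KrOr", "CD19-APCA750"]})
# expected shape: {"1": [Marker(...), Marker(...)]} with color set to None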
from fcg_logging import create_logging_handlers, setup_logging
from flowcat import io_functions, utils

LOGPATH = utils.URLPath(f"logs/assess_quality_{utils.create_stamp()}.log")
LOGGER = setup_logging(LOGPATH, "assess_quality")

ungated_samples = list(utils.URLPath("output/ungated/data").glob("**/*.LMD"))
ungated_sample_count = len(ungated_samples)
gated_samples = list(
    utils.URLPath("output/gated_single/data").glob("**/*.LMD"))
gated_sample_count = len(gated_samples)
LOGGER.info(
    "Gated/Ungated successful FCS count: %d/%d (%s %%)",
    gated_sample_count, ungated_sample_count,
    gated_sample_count / ungated_sample_count)

sample_dataset = io_functions.load_case_collection(
    utils.URLPath("output/samples"))
LOGGER.info(sample_dataset)


def foldername(path):
    return str(utils.URLPath(path.parent.name, path.name))


def ppp(v):
    LOGGER.info(v)
    return v


gated_samples_names = list(map(lambda p: foldername(p), gated_samples))
missing_paths = list(
from flowcat import io_functions, utils
from flowcat.plots import som as fc_somplot

LOGPATH = utils.URLPath(f"logs/visualize_datasets_{utils.create_stamp()}.log")
LOGGER = utils.logs.setup_logging(LOGPATH, "visualize_datasets")

OUTPUT = utils.URLPath("output/visualization/soms-ungated")
# OUTPUT.mkdir()
#
# som_dataset = io_functions.load_case_collection(utils.URLPath("output/classifier_ungated/som"))
#
# # testsample = som_dataset[0].samples[0]
#
# for case in som_dataset.filter(groups=["CLL"]):
#     testsample = case.get_tube("1", kind="som")
#     LOGGER.info(testsample)
#     somdata = testsample.get_data()
#     fig = fc_somplot.plot_som_grid(somdata, channels=["SS INT LIN", "CD45-KrOr", None])
#     fig.savefig(str(OUTPUT / f"test_{case.id}.png"))

OUTPUT = utils.URLPath("output/visualization/soms-original")
som_dataset = io_functions.load_case_collection(
    utils.URLPath("/data/flowcat-data/paper-cytometry/som/train"),
    utils.URLPath("/data/flowcat-data/paper-cytometry/som/train.json.gz"))
OUTPUT.mkdir()
for case in som_dataset.filter(groups=["CLL"]):
    testsample = case.get_tube("1", kind="som")
    LOGGER.info(testsample)
    somdata = testsample.get_data()
    fig = fc_somplot.plot_som_grid(somdata, channels=["SS INT LIN", "CD45-KrOr", None])
    fig.savefig(str(OUTPUT / f"test_{case.id}.png"))
from collections import defaultdict

from flowcat import io_functions, utils, seed as fc_seed

INPUT = {
    "data": utils.URLPath("/data/flowcat-data/mll-flowdata/decCLL-9F"),
    "meta": utils.URLPath(
        "/data/flowcat-data/mll-flowdata/decCLL-9F.2019-10-29.meta/train.json.gz"),
    "meta_test": utils.URLPath(
        "/data/flowcat-data/mll-flowdata/decCLL-9F.2019-10-29.meta/test.json.gz"),
}

train_dataset = io_functions.load_case_collection(INPUT["data"], INPUT["meta_test"])
# sort by infiltration, treating missing (0.0) values as maximal so they end up last
sorted_cases = sorted(
    train_dataset,
    key=lambda c: c.infiltration if c.infiltration > 0.0 else 1000)

perc01_count = 0
group_count = defaultdict(int)
for case in sorted_cases[:100]:
    print("Minimal infiltration sample:", case, case.infiltration)
    if case.infiltration == 0.1:
        perc01_count += 1
    group_count[case.group] += 1
print(perc01_count)
print(group_count)
def main(data: utils.URLPath, meta: utils.URLPath, reference: utils.URLPath, model: utils.URLPath):
    data, meta, soms, model = map(utils.URLPath, [
        "/data/flowcat-data/mll-flowdata/decCLL-9F",
        "output/0-final-dataset/train.json.gz",
        "output/som-fix-test/soms-test/som_r4_1",
        "output/0-final/classifier-minmax-new",
    ])
    sommodel = utils.URLPath("output/som-fix-test/unjoined-ref")
    sommodel = io_functions.load_casesom(sommodel)

    output = utils.URLPath("output/0-final/model-analysis/saliency")
    output.mkdir()

    dataset = io_functions.load_case_collection(data, meta)
    soms = som_dataset.SOMDataset.from_path(soms)
    model = SaliencySOMClassifier.load(model)
    val_dataset = model.get_validation_data(dataset)
    val_seq = model.create_sequence(soms)

    selected_labels = [
        "c3a6098bd5216c7d1f958396dd31bd6ef1646c18",
        "df726c162ed728c2886107e665ad931e5bf0baae",
        "3eb03bea6651c302ac013f187b288ee990889b29",
        "e539b3ec66b1c9d7a0aae1fbd37c19c7ac86a18c",
        "762a2a19d1913383f41ead7b5ef74a8133d67847",
        "bbfafb3d9053e212279aaada5faf23eddf4a5926",
        "9503bfad60524615a06613cfbffa3861fb66ede3",
    ]
    sel_dataset = dataset.filter(labels=selected_labels)

    # annotate each fcs point with saliency info
    session = tf.Session()
    bmu_calc = calculate_bmu_indexes()
    normalize = mpl.colors.Normalize(vmin=0, vmax=1)

    case = sel_dataset[0]
    for case in sel_dataset:
        case_output = output / f"{case.id}_g{case.group}"
        case_output.mkdir()
        print("Plotting", case)
        # plot som and saliency activations
        result = model.calculate_saliency(
            val_seq, case, case.group, maximization=False)
        xdata, _ = val_seq.get_batch_by_label([case.id])
        xdata = [x[0, ...] for x in xdata]
        for tube in ("1", "2", "3"):
            fig = plot_saliency_som_map(
                model, xdata, result, tube,
                ("CD45-KrOr", "SS INT LIN", "CD19-APCA750"))
            fig.savefig(str(case_output / f"t{tube}_overlay.png"))
            fig = plot_saliency_scatterplot(
                model, bmu_calc, session, case, tube, xdata, result,
                norm=normalize)
            fig.savefig(str(case_output / f"t{tube}_scatter_saliency.png"))

    for case in sel_dataset:
        case_output = output / f"maxall_{case.id}_g{case.group}"
        case_output.mkdir()
        print("Plotting", case)
        # plot som and saliency activations
        result = model.calculate_saliency(
            val_seq, case, case.group, maximization=False)
        for r in result:
            print("Max", np.max(r))
        xdata, _ = val_seq.get_batch_by_label([case.id])
        xdata = [x[0, ...] for x in xdata]
        for tube in ("1", "2", "3"):
            fig = plot_saliency_som_map(
                model, xdata, result, tube,
                ("CD45-KrOr", "SS INT LIN", "CD19-APCA750"))
            fig.savefig(str(case_output / f"t{tube}_overlay.png"))
            fig = plot_saliency_scatterplot(
                model, bmu_calc, session, case, tube, xdata, result,
                norm=normalize)
            fig.savefig(str(case_output / f"t{tube}_scatter_saliency.png"))

    # case_som = soms.get_labels([case.id]).iloc[0]

    hcls = val_dataset.filter(groups=["HCL"])

    from collections import defaultdict
    max_vals = defaultdict(lambda: defaultdict(list))
    mean_vals = defaultdict(lambda: defaultdict(list))
    for case in hcls:
        print(case)
        gradient = model.calculate_saliency(
            val_seq, case, case.group, maximization=False)
        for i, (tube, markers) in enumerate(model.config["tubes"].items()):
            tgrad = gradient[i]
            for j, marker in enumerate(markers["channels"]):
                mgrad = tgrad[:, :, j]
                gmax = np.max(mgrad)
                max_vals[tube][marker].append(gmax)
                gmean = np.mean(mgrad)
                mean_vals[tube][marker].append(gmean)

    max_markers = defaultdict(list)
    for tube, markers in model.config["tubes"].items():
        for marker in markers["channels"]:
            print("Max", tube, marker, np.mean(max_vals[tube][marker]))
            print("Mean", tube, marker, np.mean(mean_vals[tube][marker]))
            max_markers[tube].append((marker, np.mean(max_vals[tube][marker])))
        (
            fcs_data[tube],
            flowsom_data[tube],
            flowcat_data[tube],
        ),
        (
            {"s": 1, "marker": ".", "color": "grey", "label": "fcs"},
            {"s": 8, "marker": ".", "color": "blue", "label": "flowCat", "alpha": 0.5},
            {"s": 8, "marker": ".", "color": "red", "label": "flowSOM", "alpha": 0.5},
        ),
        tube,
        output / name)


cases = io_functions.load_case_collection(
    utils.URLPath("output/4-flowsom-cmp/samples"),
    utils.URLPath("output/4-flowsom-cmp/samples/samples.json"))

# Compare flowsom results with flowcat results. Do keep in mind that they are
# scaled differently
# flowsom_path = utils.URLPath("output/4-flowsom-cmp/flowsom-samples")
# flowcat_path = utils.URLPath("output/4-flowsom-cmp/flowcat-denovo")
# flowcat_ref_path = flowcat.utils.URLPath("output/4-flowsom-cmp/flowcat-refsom")
output = utils.URLPath("output/4-flowsom-cmp/figures-refit")

tubes = ("1", "2", "3")
groups = set(cases.groups)

from collections import defaultdict
def main(data: utils.URLPath, meta: utils.URLPath, reference: utils.URLPath, model: utils.URLPath):
    data, meta, soms, model = map(utils.URLPath, [
        "/data/flowcat-data/mll-flowdata/decCLL-9F",
        "output/0-final-dataset/train.json.gz",
        "output/som-fix-test/soms-test/som_r4_1",
        "output/0-final/classifier-minmax-new",
    ])
    dataset = io_functions.load_case_collection(data, meta)
    soms = som_dataset.SOMDataset.from_path(soms)
    model = SaliencySOMClassifier.load(model)
    val_dataset = model.get_validation_data(dataset)
    val_seq = model.create_sequence(soms)

    # printing out weights and biases, unsure whether they actually contain
    # information; in theory we could extend that to attempt to describe them
    # as gates
    tube = "3"
    weights, biases = model.model.layers[int(tube) + 2].get_weights()
    for j, chname in enumerate(model.config["tubes"][tube]["channels"]):
        ch_mean_weight = np.mean(weights[:, :, j, :])
        print(j, chname, ch_mean_weight)

    for i in range(weights.shape[-1]):
        mean_weight = np.mean(weights[:, :, :, i])
        print(i, mean_weight, biases[i])
        for j, chname in enumerate(model.config["tubes"]["1"]["channels"]):
            print(i, j, chname)
            print(weights[:, :, j, i])

    # zero out specific columns and see how that impacts performance
    output = utils.URLPath("output/0-final/model-analysis/occlusion")
    for group in model.config["groups"]:
        print(group)
        sel_cases = val_dataset.filter(groups=[group])
        avg_results = model.channel_occlusion(sel_cases, val_seq)
        print(sorted(avg_results, key=lambda t: t[2], reverse=True))
        io_functions.save_json(avg_results, output / f"{group}_avg_std.json")

    # case_som = soms.get_labels([case.id]).iloc[0]

    hcls = val_dataset.filter(groups=["HCL"])

    from collections import defaultdict
    max_vals = defaultdict(lambda: defaultdict(list))
    mean_vals = defaultdict(lambda: defaultdict(list))
    for case in hcls:
        print(case)
        gradient = model.calculate_saliency(
            val_seq, case, case.group, maximization=False)
        for i, (tube, markers) in enumerate(model.config["tubes"].items()):
            tgrad = gradient[i]
            for j, marker in enumerate(markers["channels"]):
                mgrad = tgrad[:, :, j]
                gmax = np.max(mgrad)
                max_vals[tube][marker].append(gmax)
                gmean = np.mean(mgrad)
                mean_vals[tube][marker].append(gmean)

    max_markers = defaultdict(list)
    for tube, markers in model.config["tubes"].items():
        for marker in markers["channels"]:
            print("Max", tube, marker, np.mean(max_vals[tube][marker]))
            print("Mean", tube, marker, np.mean(mean_vals[tube][marker]))
            max_markers[tube].append((marker, np.mean(max_vals[tube][marker])))

    for tube in model.config["tubes"]:
        print("Tube", tube)
        print("\n".join(
            ": ".join((t[0], str(t[1])))
            for t in sorted(max_markers[tube], key=lambda t: t[1], reverse=True)))


c_model = MLLDATA / "mll-sommaps/models/relunet_samplescaled_sommap_6class/model_0.h5"
c_labels = MLLDATA / "mll-sommaps/output/relunet_samplescaled_sommap_6class/test_labels.json"
c_preds = MLLDATA / "mll-sommaps/models/relunet_samplescaled_sommap_6class/predictions_0.csv"
c_config = MLLDATA / "mll-sommaps/output/relunet_samplescaled_sommap_6class/config.json"
c_cases = MLLDATA / "mll-flowdata/CLL-9F"
c_sommaps = MLLDATA / "mll-sommaps/sample_maps/selected1_toroid_s32"
c_misclass = MLLDATA / "mll-sommaps/misclassifications/"
c_tube = [1, 2]

# load datasets
somdataset = sd.SOMDataset.from_path(c_sommaps)
cases = cc.CaseCollection.from_path(c_cases, how="case_info.json")

# filter datasets
test_labels = flowutils.load_json(c_labels)
filtered_cases = cases.filter(labels=test_labels)
somdataset.data[1] = somdataset.data[1].loc[test_labels, :]

# get mapping
config = flowutils.load_json(c_config)
groupinfo = mappings.GROUP_MAPS[config["c_groupmap"]]

dataset = cd.CombinedDataset(
    filtered_cases,
    {
        dd.Dataset.from_str('SOM'): somdataset,
        dd.Dataset.from_str('FCS'): filtered_cases,
    },
    group_names=groupinfo['groups'])

# modify mapping
dataset.set_mapping(groupinfo)

xoutputs = [
    loaders.loader_builder(
        loaders.Map2DLoader.create_inferred,
        tube=1,
        sel_count="counts",
        pad_width=1,
    ),
    loaders.loader_builder(
        loaders.Map2DLoader.create_inferred,
        tube=2,
        sel_count="counts",
        pad_width=1,
    )
]

dataset = loaders.DatasetSequence.from_data(
    dataset, xoutputs, batch_size=1, draw_method="sequential")

predictions = pd.read_csv(c_preds, index_col=0)
predictions = add_correct_magnitude(predictions)
predictions = add_infiltration(predictions, cases)

misclass_labels = ['507777582649cbed8dfb3fe552a6f34f8b6c28e3']
for label in misclass_labels:
    label_path = pathlib.Path(f"{c_misclass}/{label}")
    if not label_path.exists():
        label_path.mkdir()

    case = cases.get_label(label)

    # get the actual and the predicted class
    corr_group = predictions.loc[case.id, "correct"]
    pred_group = predictions.loc[case.id, "pred"]
    classes = [corr_group, pred_group]

    gradients = plotting.calc_saliency(
        dataset, case, c_model, classes=classes)

    for tube in c_tube:
        heatmaps = plotting.draw_saliency_heatmap(case, gradients, classes, tube)
        for idx, heatmap in enumerate(heatmaps):
            plotting.save_figure(
                heatmap,
                f"{c_misclass}/{label}/{classes[idx]}_tube_{tube}_saliency_heatmap.png")

        scatterplots = plotting.plot_tube(
            case, tube, gradients[tube - 1], classes=classes,
            sommappath=c_sommaps)
        for idx, scatterplot in enumerate(scatterplots):
            plotting.save_figure(
                scatterplot,
                f"{c_misclass}/{label}/{classes[idx]}_tube_{tube}_scatterplots.png")