def main(args):
    output_dir = args.output / args.name
    dataset = flowcat.CaseCollection.from_path(args.input, metapath=args.meta)
    selected_labels = io_functions.load_json("data/selected_cases.json")
    selected, _ = dataset.filter_reasons(labels=selected_labels)
    # Note: this overrides the filtered selection with a small sample from the full dataset.
    selected = dataset.sample(count=1, groups=["CLL", "normal"])
    print(selected.labels)

    joined_tubes = io_functions.load_json(
        "output/00-dataset-test/munich_bonn_tubes.json")
    print(joined_tubes)

    # TODO: Generate a SOM for all tubes for the given labels.
    # Visualize using tensorboard.
    # Save everything into a single folder which we can use in the next script
    # to create single SOMs.
    model = som.CaseSingleSom(
        tube=1,
        materials=flowcat.ALLOWED_MATERIALS,
        markers=joined_tubes["1"],
        marker_name_only=True,
        max_epochs=10,
        batch_size=10000,
        marker_images=som.fcssom.MARKER_IMAGES_NAME_ONLY,
        map_type="toroid",
        tensorboard_dir=output_dir / "tensorboard",
        dims=(32, 32, -1))
    model.train(selected)
    model.save(output_dir / "model")
def main(args):
    dataset = som_dataset.SOMDataset.from_path(args.input)
    val = args.val
    train = args.train
    OUTPUT = args.output
    PANEL = args.panel
    bal = args.bal

    # Set the groups according to the panel.
    if PANEL == "MLL":
        groups = GROUPS
    elif PANEL == "ERLANGEN":
        groups = ["CLL", "MBL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]
    else:
        groups = ["CLL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]

    tubes = ("1",)  # single-element tuple; ("1") would just be the string "1"
    mapping = None
    balance = {key: bal for key in groups}

    config = classifier.SOMClassifierConfig(**{
        "tubes": {tube: dataset.config[tube] for tube in tubes},
        "groups": groups,
        "pad_width": 2,
        "mapping": mapping,
        "cost_matrix": None,
        "train_epochs": 20,
    })

    val = io_functions.load_json(val)
    validate_dataset = dataset.filter(labels=val)
    labels = io_functions.load_json(train)
    train_dataset = dataset.filter(labels=labels)

    train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
        train_dataset,
        split_ratio=0.9,
        groups=groups,
        mapping=mapping,
        balance=balance,
        val_dataset=validate_dataset)

    print(train_dataset.group_count)
    print(validate_dataset.group_count)

    model = fc_api.train_som_classifier(train_dataset, validate_dataset, config)

    model.save(OUTPUT)
    model.save_information(OUTPUT)
def load(cls, path: utils.URLPath):
    """Load classifier model from the given path."""
    config = io_functions.load_json(path / "config.json")
    model = keras.models.load_model(str(path / "model.h5"))
    binarizer = io_functions.load_joblib(path / "binarizer.joblib")
    data_ids = {
        "validation": io_functions.load_json(path / "ids_validate.json"),
        "train": io_functions.load_json(path / "ids_train.json"),
    }
    return cls(model, binarizer, config, data_ids=data_ids)
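# Usage sketch (path and dataset are placeholders, not from the original scripts):
# reload a classifier saved with the files referenced above and run it on an already
# loaded SOM dataset, mirroring the predict() script further below.
model = classifier.SOMClassifier.load(utils.URLPath("output/classifier-model"))
sequence = model.create_sequence(dataset, 128)
values, pred_labels = model.predict_generator(sequence)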
def main():
    bonn_config = io_functions.load_json("output/00-dataset-test/bonn_config.json")
    munich_config = io_functions.load_json("output/00-dataset-test/train_config.json")

    selected = {}
    for tube, markers in bonn_config["selected_markers"].items():
        selected[tube] = []
        munich_tube = [remove_stem(m) for m in munich_config["selected_markers"][tube]]
        for marker in markers:
            marker_stem = remove_stem(marker)
            if marker_stem in munich_tube:
                selected[tube].append(marker_stem)

    print(selected)
    io_functions.save_json(selected, "output/00-dataset-test/munich_bonn_tubes.json")
def load_datasets(data_path):
    datasets = {}
    for d in filter(lambda d: d.is_dir(), data_path.iterdir()):
        datasets[d.name] = {
            "data": io_functions.load_case_collection(d, d + ".json"),
            "config": io_functions.load_json(d + "_config.json"),
        }
    return datasets
def load_flowcat_data(case_id, flowcat_path, tubes):
    """Load given flowcat data into a dict of pandas dataframes."""
    soms = {}
    config = io_functions.load_json(flowcat_path + "_config.json")
    for tube in tubes:
        sompath = flowcat_path / f"{case_id}_t{tube}.npy"
        channels = config[tube]["channels"]
        soms[tube] = pd.DataFrame(
            np.load(sompath).reshape((-1, len(channels))),
            columns=channels)
    return soms
def main(
        data: utils.URLPath,
        meta: utils.URLPath,
        output: utils.URLPath,
        reference_ids: utils.URLPath = None,
        reference: utils.URLPath = None,
        tensorboard_dir: utils.URLPath = None,
        modelargs: json.loads = None,
        transargs: json.loads = None,
        mode: str = "fit_transform",
):
    """Train a SOM and use its weights to initialize individual SOM training.

    Args:
        data: Path to fcs data.
        meta: Path to dataset metadata, which should correctly reference the fcs data.
        output: Path for the output model and transformed cases.
        reference_ids: Optional list of ids to be used for reference SOM generation.
        reference: Optional pretrained model.
        tensorboard_dir: Optional directory for tensorboard logs.
        modelargs: Optional specific options for reference SOM generation.
        transargs: Optional specific options for transforming individual SOMs.
        mode: Whether to fit or to transform. Defaults to both.
    """
    dataset = io_functions.load_case_collection(data, meta)

    if reference is None:
        reference_ids = io_functions.load_json(reference_ids)
        reference_dataset = dataset.filter(labels=reference_ids)
        print("Training reference SOM on", reference_dataset)
        reference = train_model(reference_dataset, modelargs=modelargs)
        reference_output = output / "reference"
        io_functions.save_casesom(reference, reference_output)
        reference = reference_output

    if mode == "fit":
        return

    if transargs is None:
        transargs = {
            "max_epochs": 4,
            "batch_size": 50000,
            "initial_radius": 4,
            "end_radius": 1,
        }

    model = io_functions.load_casesom(
        reference, tensorboard_dir=tensorboard_dir, **transargs)

    som_output = output / "som"
    transform_cases(dataset, model, som_output)
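# Hypothetical invocation sketch (all paths are placeholders): fit a reference SOM on
# the listed cases, then transform every case in the dataset into an individual SOM.
main(
    data=utils.URLPath("data/fcs"),
    meta=utils.URLPath("data/meta.json.gz"),
    output=utils.URLPath("output/soms"),
    reference_ids=utils.URLPath("data/reference_ids.json"),
    mode="fit_transform",
)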
def main(
        fcsdata: utils.URLPath,
        fcsmeta: utils.URLPath,
        somdata: utils.URLPath,
        output: utils.URLPath,
):
    fcs_dataset = io_functions.load_case_collection(fcsdata, fcsmeta)
    try:
        som_config = io_functions.load_json(somdata + "_config.json")
    except FileNotFoundError:
        som_config = None

    if som_config is None:
        selected_markers = fcs_dataset.selected_markers
    else:
        selected_markers = {t: d["channels"] for t, d in som_config.items()}

    tubes = ("1", "2", "3")
    model = quantization_error_model()
    sess = tf.Session()

    results = []
    for fcscase in fcs_dataset:
        print(fcscase)
        for tube in tubes:
            fcssample = fcscase.get_tube(tube, kind="fcs").get_data()
            somsample = get_som_data(fcscase.id, tube, somdata, selected_markers[tube])
            error = sample_quantization_error(fcssample, somsample, model, sess)
            results.append((fcscase.id, tube, error))

    # Aggregate per tube, dividing by the number of samples of that tube
    # (dividing by len(results) would mix the tube counts together).
    stats = {"mean": {}, "variance": {}}
    for tube in tubes:
        tube_errors = [r[-1] for r in results if r[1] == tube]
        stats["mean"][tube] = sum(tube_errors) / len(tube_errors)
        stats["variance"][tube] = sum(
            np.power(e - stats["mean"][tube], 2) for e in tube_errors) / len(tube_errors)
    print("Mean quantization error", stats)

    io_functions.save_json(results, output / "quantization_error.json")
    io_functions.save_json(stats, output / "quantization_error_mean.json")
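# A tiny worked check of the per-tube aggregation above, with invented numbers:
# tube "1" has errors 0.2 and 0.4, so its mean is 0.3 and its variance is 0.01.
results = [("case_a", "1", 0.2), ("case_b", "1", 0.4), ("case_a", "2", 0.1)]
tube_errors = [r[-1] for r in results if r[1] == "1"]
mean_1 = sum(tube_errors) / len(tube_errors)                            # 0.3
var_1 = sum((e - mean_1) ** 2 for e in tube_errors) / len(tube_errors)  # 0.01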
def predict(
        data: utils.URLPath,
        model: utils.URLPath,
        output: utils.URLPath,
        labels: utils.URLPath = None,
        metrics: bool = True,
):
    """Generate predictions and plots for the given SOM dataset.

    Args:
        data: SOM dataset.
        model: Path to model containing CNN and SOMs.
        output: Destination for plotting.
        labels: List of case ids to be filtered for generating predictions.
        metrics: Whether to also generate metrics against the true labels.
    """
    print(f"Loaded cases from {data}")
    dataset = som_dataset.SOMDataset.from_path(data)
    if labels:
        labels = io_functions.load_json(labels)
        dataset = dataset.filter(labels=labels)

    model = classifier.SOMClassifier.load(model)
    data_sequence = model.create_sequence(dataset, 128)
    values, pred_labels = model.predict_generator(data_sequence)

    pred_json = {
        id: dict(zip(model.config.groups, value.tolist()))
        for id, value in zip(dataset.labels, values)
    }
    io_functions.save_json(pred_json, output / "prediction.json")

    if metrics:
        true_labels = data_sequence.true_labels
        map_config = [
            ("unmapped", {"groups": model.config.groups, "map": {}}),
            *GROUP_MAPS.items(),
        ]
        for map_name, mapping in map_config:
            print(f"--- MAPPING: {map_name} ---")
            if len(mapping["groups"]) > len(model.config.groups):
                continue
            fc_predictions.generate_all_metrics(
                true_labels, pred_labels, mapping, output / map_name)
def main(args): """Load case ids from json file to filter cases and train and save the created model.""" output_dir = args.output dataset = io_functions.load_case_collection(args.data, args.meta) selected_labels = io_functions.load_json(args.cases) selected, _ = dataset.filter_reasons(labels=selected_labels) if args.tensorboard: tensorboard_dir = output_dir / "tensorboard" else: tensorboard_dir = None model = train_model(selected, markers=args.markers, tensorboard=tensorboard_dir, marker_name_only=args.marker_name_only) io_functions.save_casesom(model, output_dir)
def from_path(cls, path):
    """Loads a SOM dataset with the following organization:

        dataset/
            config.json    # contains info on used markers
            meta.json.gz*
            data/          # contains .npy SOMs
            dataset.csv*

    * either a csv file with metadata (old format) or a meta.json.gz
      (case collection variant, new data)
    """
    config = io_functions.load_json(path / "config.json")
    try:
        metadata = io_functions.load_csv(path + ".csv")
    except FileNotFoundError:
        metadata = from_case_dataset(path)

    tubes = list(config.keys())
    data_path = path / "data"
    som_cases = metadata.apply(load_som_cases, axis=1, args=(data_path, tubes))
    return cls(data=som_cases, config=config)
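# Usage sketch (path is a placeholder): load a SOM dataset organized as described in
# the docstring above, then restrict it to specific groups, mirroring the training
# scripts in this repository (assumes som_dataset and utils are imported as there).
dataset = som_dataset.SOMDataset.from_path(utils.URLPath("output/som-dataset"))
print(dataset.config.keys())  # tubes defined in config.json
subset = dataset.filter(groups=["CLL", "normal"])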
def train_model(
        dataset,
        markers=None,
        tensorboard=None,
        modelargs=None,
) -> sommodels.casesom.CaseSom:
    """Create and train a SOM model using the given dataset."""
    if modelargs is None:
        modelargs = {
            "marker_name_only": False,
            "max_epochs": 10,
            "batch_size": 50000,
            "initial_radius": 16,
            "end_radius": 2,
            "radius_cooling": "linear",
            # "marker_images": sommodels.fcssom.MARKER_IMAGES_NAME_ONLY,
            "map_type": "toroid",
            "dims": (32, 32, -1),
            "scaler": "MinMaxScaler",
        }

    if markers:
        selected_markers = io_functions.load_json(markers)
    else:
        selected_markers = dataset.selected_markers

    # Modify marker names if marker_name_only is set.
    if modelargs.get("marker_name_only", False):
        selected_markers = {
            tube: [extract_name(marker) for marker in markers]
            for tube, markers in selected_markers.items()
        }

    model = sommodels.casesom.CaseSom(
        tubes=selected_markers,
        tensorboard_dir=tensorboard,
        modelargs=modelargs,
    )
    model.train(dataset)
    return model
def train_model(
        dataset,
        markers=None,
        size=32,
        scaler="RefitStandardScaler",
        tensorboard=None,
        marker_name_only=False):
    """Create and train a SOM model using the given dataset."""
    if markers:
        selected_markers = io_functions.load_json(markers)
    else:
        selected_markers = dataset.selected_markers

    # Modify marker names if marker_name_only is set.
    if marker_name_only:
        selected_markers = {
            tube: [extract_name(marker) for marker in markers]
            for tube, markers in selected_markers.items()
        }

    # scaler = "StandardScaler"
    # scaler = "RefitStandardScaler"
    # scaler = "MinMaxScaler"
    model = sommodels.casesom.CaseSom(
        tubes=selected_markers,
        tensorboard_dir=tensorboard,
        modelargs={
            "marker_name_only": marker_name_only,
            "max_epochs": 10,
            "batch_size": 50000,
            "initial_radius": int(size / 2),
            "end_radius": 2,
            "radius_cooling": "linear",
            # "marker_images": sommodels.fcssom.MARKER_IMAGES_NAME_ONLY,
            "map_type": "toroid",
            "dims": (size, size, -1),
            "scaler": scaler,
        })
    model.train(dataset)
    return model
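# Usage sketch (paths are placeholders): train a 32x32 CaseSom on a loaded case
# collection and save it, mirroring how the main() entry points in this repository
# call train_model.
cases = io_functions.load_case_collection(
    utils.URLPath("data/fcs"), utils.URLPath("data/meta.json.gz"))
som_model = train_model(cases, size=32, scaler="MinMaxScaler")
io_functions.save_casesom(som_model, utils.URLPath("output/som-model"))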
def json_results(self):
    print(self.path)
    return io_functions.load_json(self.path / "preds" / "validation_metrics.json")
def main(data: utils.URLPath, meta: utils.URLPath, output: utils.URLPath, epochs: int = 30):
    """
    Args:
        data: Path to som dataset.
        output: Output path.
    """
    tubes = ("1", "2", "3")
    pad_width = 2

    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]
    # mapping = None
    # groups = mappings.GROUPS

    # dataset = io_functions.load_case_collection(data, meta)
    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)

    dataset = dataset.filter(groups=groups)

    dataset_groups = {d.group for d in dataset}
    if set(groups) != dataset_groups:
        raise RuntimeError(f"Group mismatch: {groups}, but got {dataset_groups}")

    train, validate = dataset.create_split(0.9, stratify=True)

    group_weights = None
    # group_count = train.group_count
    # group_weights = classification_utils.calculate_group_weights(group_count)
    # group_weights = {
    #     i: group_weights.get(g, 1.0) for i, g in enumerate(groups)
    # }

    # train = train.balance(2000)
    train = train.balance_per_group({
        "CM": 6000,
        # "CLL": 4000,
        # "MBL": 2000,
        "MCL": 1000,
        "PL": 1000,
        "LPL": 1000,
        "MZL": 1000,
        "FL": 1000,
        "HCL": 1000,
        "normal": 6000,
    })

    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    # Cost mapping is always (true, pred).
    cost_mapping = {
        ("CLL", "MBL"): 0.5,
        ("MBL", "CLL"): 0.5,
        ("MCL", "PL"): 0.5,
        ("PL", "MCL"): 0.5,
        ("LPL", "MZL"): 0.5,
        ("MZL", "LPL"): 0.5,
        ("CLL", "normal"): 8,
        ("MBL", "normal"): 8,
        ("MCL", "normal"): 8,
        ("PL", "normal"): 8,
        ("LPL", "normal"): 8,
        ("MZL", "normal"): 8,
        ("FL", "normal"): 8,
        ("HCL", "normal"): 8,
    }
    if mapping:
        cost_mapping = {
            (mapping.get(a, a), mapping.get(b, b)): v
            for (a, b), v in cost_mapping.items()
        }
    # cost_matrix = classification_utils.build_cost_matrix(cost_mapping, groups)
    # np.save(str(output / "cost_matrix.npy"), cost_matrix)
    cost_matrix = None

    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
        "cost_matrix": "cost_matrix.npy" if cost_matrix is not None else None,
    }
    io_functions.save_json(config, output / "config.json")

    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    binarizer, model = get_model(selected_tubes, groups=groups, global_decay=5e-5)

    if cost_matrix is not None:
        loss = classification_utils.WeightedCategoricalCrossentropy(cost_matrix)
    else:
        loss = "categorical_crossentropy"

    model.compile(
        loss=loss,
        # loss="categorical_crossentropy",
        # loss="binary_crossentropy",
        optimizer="adam",
        # optimizer=optimizers.Adam(lr=0.0, decay=0.0, epsilon=epsilon),
        metrics=[
            keras.metrics.CategoricalAccuracy(),
        ])

    with (output / "model_summary.txt").open("w") as summary_file:
        def print_file(*args, **kwargs):
            print(*args, **kwargs, file=summary_file)
        model.summary(print_fn=print_file)

    keras.utils.plot_model(model, to_file=str(output / "model_plot.png"))

    def getter_fun(sample, tube):
        return sample.get_tube(tube)

    trainseq = SOMSequence(
        train, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=32,
        pad_width=pad_width)
    validseq = SOMSequence(
        validate, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=128,
        pad_width=pad_width)

    # tensorboard_dir = str(output / "tensorboard")
    # tensorboard_callback = keras.callbacks.TensorBoard(
    #     log_dir=str(tensorboard_dir),
    #     histogram_freq=5,
    #     write_grads=True,
    #     write_images=True,
    # )
    nan_callback = keras.callbacks.TerminateOnNaN()

    history = model.fit_generator(
        epochs=epochs,
        shuffle=True,
        callbacks=[
            # tensorboard_callback,
            nan_callback,
        ],
        class_weight=group_weights,
        generator=trainseq,
        validation_data=validseq)

    model.save(str(output / "model.h5"))
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    preds = []
    for pred in model.predict_generator(validseq):
        preds.append(pred)
    pred_arr = np.array(preds)
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    generate_all_metrics(
        true_labels, pred_labels, {"groups": groups, "map": {}}, output / "unmapped")
    for map_name, mapping in mappings.GROUP_MAPS.items():
        output_path = output / map_name
        # Skip if the map contains more groups than the model was trained on.
        print(f"--- MAPPING: {map_name} ---")
        if len(mapping["groups"]) > len(groups):
            continue
        generate_all_metrics(true_labels, pred_labels, mapping, output_path)

    plot_training_history(history, output / "training.png")
def main(data: utils.URLPath, output: utils.URLPath, model_name: str,
         modelargs: json.loads, epochs: int = 30):
    """
    Args:
        data: Path to som dataset.
        output: Output path.
    """
    tubes = ("1", "2", "3")
    pad_width = 0

    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]
    # mapping = None
    # groups = mappings.GROUPS

    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)

    dataset = dataset.filter(groups=groups)

    dataset_groups = {d.group for d in dataset}
    if set(groups) != dataset_groups:
        raise RuntimeError(f"Group mismatch: {groups}, but got {dataset_groups}")

    train, validate = dataset.create_split(0.9, stratify=True)

    # train = train.balance(20)
    train = train.balance_per_group({
        "CM": 6000,
        # "CLL": 4000,
        # "MBL": 2000,
        "MCL": 1000,
        "PL": 1000,
        "LPL": 1000,
        "MZL": 1000,
        "FL": 1000,
        "HCL": 1000,
        "normal": 6000,
    })

    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
        "cost_matrix": None,
    }
    io_functions.save_json(config, output / "config.json")

    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    # Note: the model_name argument is currently ignored; "RandomForest" is hard-coded.
    # binarizer, model = get_model(selected_tubes, groups=groups, n_neighbors=1)
    binarizer, model = get_model(
        selected_tubes, groups=groups, model_name="RandomForest", **modelargs)

    def getter_fun(sample, tube):
        return sample.get_tube(tube)

    trainseq = SOMSequence(
        train, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=32,
        pad_width=pad_width)
    validseq = SOMSequence(
        validate, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=128,
        pad_width=pad_width)

    xdata, ydata = sequence_to_array(trainseq)
    model.fit(xdata, ydata)

    xtest, ytest = sequence_to_array(validseq)
    pred_arr = model.predict(xtest)

    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    generate_all_metrics(
        true_labels, pred_labels, {"groups": groups, "map": {}}, output / "unmapped")
    for map_name, mapping in mappings.GROUP_MAPS.items():
        output_path = output / map_name
        # Skip if the map contains more groups than the model was trained on.
        print(f"--- MAPPING: {map_name} ---")
        if len(mapping["groups"]) > len(groups):
            continue
        generate_all_metrics(true_labels, pred_labels, mapping, output_path)
def main(args):
    dataset = som_dataset.SOMDataset.from_path(args.input)
    val = args.val
    train = args.train
    OUTPUT = args.output

    groups = ["MCL", "PL"]
    tubes = ("1",)  # single-element tuple; ("1") would just be the string "1"
    mapping = None
    balance = {
        "MCL": 20,
        "PL": 20,
    }

    config = classifier.SOMClassifierConfig(**{
        "tubes": {tube: dataset.config[tube] for tube in tubes},
        "groups": groups,
        "pad_width": 2,
        "mapping": mapping,
        "cost_matrix": None,
    })

    val = io_functions.load_json(val)
    validate_dataset = dataset.filter(labels=val)
    labels = io_functions.load_json(train)
    train_dataset = dataset.filter(labels=labels)

    train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
        train_dataset,
        split_ratio=0.9,
        groups=groups,
        mapping=mapping,
        balance=balance,
        val_dataset=validate_dataset)

    print(train_dataset.group_count)
    print(validate_dataset.group_count)

    binarizer, model = load_model(args.model)

    trainseq = som_dataset.SOMSequence(
        train_dataset, binarizer, tube=config.tubes, pad_width=config.pad_width)
    validseq = som_dataset.SOMSequence(
        validate_dataset, binarizer, tube=config.tubes, pad_width=config.pad_width)

    model.fit_generator(generator=trainseq, epochs=10, validation_data=validseq)

    args.output.mkdir(parents=True, exist_ok=True)
    io_functions.save_joblib(binarizer, OUTPUT / "binarizer.joblib")
    model.save(str(args.output / "model.h5"))
    io_functions.save_json(config.to_json(), OUTPUT / "config.json")
    io_functions.save_json(validseq.dataset.labels, OUTPUT / "ids_validate.json")
    io_functions.save_json(trainseq.dataset.labels, OUTPUT / "ids_train.json")
def main(data: utils.URLPath, meta: utils.URLPath, output: utils.URLPath):
    """
    Args:
        data: Path to som dataset.
        output: Output path.
    """
    tubes = ("2", "3", "4")
    pad_width = 1

    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]

    # dataset = io_functions.load_case_collection(data, meta)
    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)

    dataset = dataset.filter(groups=[g for g in groups if g not in ("LPL", "MZL")])

    dataset_groups = {d.group for d in dataset}
    # if set(groups) != dataset_groups:
    #     raise RuntimeError(f"Group mismatch: {groups}, but got {dataset_groups}")

    validate, train = dataset.create_split(10, stratify=True)

    group_count = train.group_count
    num_cases = sum(group_count.values())
    balanced_nums = num_cases / len(dataset_groups)
    balanced_loss_weights = [balanced_nums / group_count.get(g, balanced_nums) for g in groups]
    min_ratio = min(balanced_loss_weights)
    balanced_loss_weights = {i: v / min_ratio for i, v in enumerate(balanced_loss_weights)}
    print(balanced_loss_weights)

    # train = train.balance(2000)
    # train = train.balance_per_group({
    #     "CM": 6000,
    #     # "CLL": 4000,
    #     # "MBL": 2000,
    #     "MCL": 1000,
    #     "PL": 1000,
    #     "LPL": 1000,
    #     "MZL": 1000,
    #     "FL": 1000,
    #     "HCL": 1000,
    #     "normal": 6000,
    # })

    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
    }
    io_functions.save_json(config, output / "config.json")

    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    binarizer, model = get_model(selected_tubes, groups=groups, global_decay=5e-7)

    def getter_fun(sample, tube):
        return sample.get_tube(tube)

    trainseq = SOMSequence(
        train, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=32,
        pad_width=pad_width)
    validseq = SOMSequence(
        validate, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=128,
        pad_width=pad_width)

    tensorboard_dir = str(output / "tensorboard")
    tensorboard_callback = keras.callbacks.TensorBoard(
        log_dir=str(tensorboard_dir),
        histogram_freq=5,
        write_grads=True,
        write_images=True,
    )
    nan_callback = keras.callbacks.TerminateOnNaN()

    model.fit_generator(
        epochs=15,
        shuffle=True,
        callbacks=[tensorboard_callback, nan_callback],
        class_weight=balanced_loss_weights,
        generator=trainseq,
        validation_data=validseq)

    model.save(str(output / "model.h5"))
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    preds = []
    for pred in model.predict_generator(validseq):
        preds.append(pred)
    pred_arr = np.array(preds)
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    confusion = metrics.confusion_matrix(true_labels, pred_labels, labels=groups)
    print(groups)
    print(confusion)
    balanced = metrics.balanced_accuracy_score(true_labels, pred_labels)
    print(balanced)
import matplotlib as mpl
mpl.use("Agg")
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import pandas as pd

from flowcat import utils, io_functions


input_data = {
    p.name: io_functions.load_json(p / "quantization_error.json")
    for p in map(utils.URLPath, [
        "output/4-flowsom-cmp/quantization-error/flowsom-10",
        "output/4-flowsom-cmp/quantization-error/flowcat-refit-s10",
        "output/4-flowsom-cmp/quantization-error/flowsom-32",
        "output/4-flowsom-cmp/quantization-error/flowcat-refit-s32",
    ])
}
input_data = [
    {
        "dataset": k,
        "id": label,
        "tube": tube,
        "qe": value,
        "algo": k.split("-")[0],
        "size": int(k.split("-")[-1].lstrip("s")),
    }
    for k, vv in input_data.items()
    for label, tube, value in vv
]
data = pd.DataFrame(input_data)

sns.set_style("white")
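# One possible visualization of the long-form dataframe built above; this is only a
# sketch and not necessarily the plot used in the original analysis.
fig, ax = plt.subplots()
sns.barplot(data=data, x="size", y="qe", hue="algo", ax=ax)
ax.set_xlabel("SOM grid size")
ax.set_ylabel("quantization error")
fig.savefig("quantization_error_comparison.png")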
    for marker in markers:
        marker_names.append(
            Marker(antibody=Marker.name_to_marker(marker).antibody, color=None))
    selected_markers = {"1": marker_names}
    return selected_markers


dataset = io_functions.load_case_collection(
    utils.URLPath("/data/flowcat-data/2020_Nov_rerun/Merged_Files/MLL9F"),
    utils.URLPath(
        "/data/flowcat-data/2020_Nov_rerun/Merged_Files/MLL9F_meta/train.json.gz"
    ))
references = io_functions.load_json(
    utils.URLPath(
        "/data/flowcat-data/2020_Nov_rerun/Merged_Files/MLL9F_meta/references.json"
    ))
OUTPUT = utils.URLPath("/data/flowcat-data/2020_Nov_rerun/Merged_SOM/MLL9F")

setup_logging(None, "generate ref SOM for merged FCS")

ref_dataset = dataset.filter(labels=references)

tensorboard_dir = None

# Discover channels in the given dataset.
markers = get_tube_marker(ref_dataset)
# markers = read_sel_markers(sel_markers)
print(markers)
def main(args):
    dataset = som_dataset.SOMDataset.from_path(args.input)
    val = args.val
    train = args.train
    OUTPUT = args.output
    PANEL = args.panel
    basemodel = args.basemodel
    bal = args.bal

    # Set the groups according to the panel.
    if PANEL == "MLL":
        groups = GROUPS
    elif PANEL == "ERLANGEN":
        groups = ["CLL", "MBL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]
    else:
        groups = ["CLL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]

    tubes = ("1",)  # single-element tuple; ("1") would just be the string "1"
    mapping = None
    balance = {key: bal for key in groups}

    config = classifier.SOMClassifierConfig(**{
        "tubes": {tube: dataset.config[tube] for tube in tubes},
        "groups": groups,
        "pad_width": 2,
        "mapping": mapping,
        "cost_matrix": None,
        "train_epochs": 15,
    })

    val = io_functions.load_json(val)
    validate_dataset = dataset.filter(labels=val)
    labels = io_functions.load_json(train)
    train_dataset = dataset.filter(labels=labels)

    train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
        train_dataset,
        split_ratio=0.9,
        groups=groups,
        mapping=mapping,
        balance=balance,
        val_dataset=validate_dataset)

    print(train_dataset.group_count)
    print(validate_dataset.group_count)

    # Load the base model and get its weights.
    base_model = models.load_model(str(basemodel / "model.h5"))
    weights = base_model.get_weights()

    # Create the new model and initialize it with the base model weights.
    model = create_model(config.inputs, config.output)
    model.set_weights(weights)

    # Freeze two dense layers: check for each dataset.
    model.get_layer('dense_1').trainable = False
    model.get_layer('dense_2').trainable = False

    model.compile(
        loss=config.get_loss(modeldir=None),
        optimizer="adam",
        metrics=["accuracy"])

    # Wrap config and keras model into a SOMClassifier instance.
    model = SOMClassifier(config, model)

    train = model.create_sequence(train_dataset, config.train_batch_size)
    if validate_dataset is not None:
        validate = model.create_sequence(validate_dataset, config.valid_batch_size)
    else:
        validate = None

    model.train_generator(train, validate, epochs=config.train_epochs, class_weight=None)

    model.save(OUTPUT)
    model.save_information(OUTPUT)
# flake8: noqa
"""Create plots for channel occlusion data."""
import numpy as np

import matplotlib as mpl
mpl.use("Agg")
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
sns.set_style("white")

from flowcat import io_functions, utils, mappings


data = utils.URLPath("output/0-final/model-analysis/occlusion")
output = utils.URLPath("output/0-final/model-analysis/occlusion/plots")
output.mkdir()

group_data = [
    (p.name.split("_")[0], io_functions.load_json(p))
    for p in data.glob("*.json")
]

group_tubes = [
    (
        group,
        tube,
        np.mean([t[2] for t in gdata if t[0] == tube]),
        np.sqrt(np.mean([np.power(t[3], 2) for t in gdata if t[0] == tube])),
    )
    for group, gdata in group_data
    for tube in ("1", "2", "3")
]

colors = sns.color_palette("Blues")

pos = np.arange(len(mappings.GROUPS))
fig, ax = plt.subplots()
ax.bar(
    [
        pos[mappings.GROUPS.index(g)] + (int(t) - 2) * 0.2
def load_somclassifier_config(path: utils.URLPath) -> "SOMClassifierConfig":
    """Load somclassifier config from the given path."""
    return SOMClassifierConfig(**io_functions.load_json(path))
def plot_channel_densities(tube: str, channels: List[str], output: utils.URLPath):
    """Plot the channel densities for a given dataset.

    Args:
        tube: Tube for which intensities should be generated.
        channels: List of channels used in generation.
        output: Output directory of plots.
    """
    berlin_dataset, munich_dataset = load_datasets()

    # berlin_sample = berlin_dataset.sample(10)
    # groups = list(berlin_sample.group_count.keys())
    # munich_sample = munich_dataset.sample(10, groups=groups)
    output = utils.URLPath("output/50-berlin_dataset/plot_channel_densities")
    # io_functions.save_json(berlin_sample.labels, output / "berlin_sample_labels.json")
    # io_functions.save_json(munich_sample.labels, output / "munich_sample_labels.json")
    berlin_sample_ids = io_functions.load_json(output / "berlin_sample_labels.json")
    munich_sample_ids = io_functions.load_json(output / "munich_sample_labels.json")
    berlin_sample = berlin_dataset.filter(labels=berlin_sample_ids)
    munich_sample = munich_dataset.filter(labels=munich_sample_ids)

    berlin_markers = berlin_sample.selected_markers

    from collections import defaultdict

    # Find the best matching Berlin tube for each Munich tube.
    for tube, markers in munich_sample.selected_markers.items():
        counts = defaultdict(int)
        for btube, bmarkers in berlin_markers.items():
            bmarkers_name_only = [m.split()[0] for m in bmarkers]
            for marker in markers:
                marker = marker.replace("-", " ")
                mname = marker.split()[0]
                if marker in bmarkers:
                    print(btube, marker)
                    counts[btube] += 1
                elif mname in bmarkers_name_only:
                    print(btube, mname)
        print(counts)

    tube = "1"
    channels = ("CD45-KrOr", "SS INT LIN")
    output = utils.URLPath("output/50-berlin_dataset/plot_channel_densities/plots")
    output.mkdir()

    sns.set_style("white")

    # Create hex bin plots.
    for name, dataset in (("berlin", berlin_sample), ("munich", munich_sample)):
        for group in ("normal", "MCL", "CLL", "FL", "LPL", "MZL", "HCL"):
            datas_x, datas_y = data_to_channels(dataset, tube, channels)
            data_x = pd.concat(datas_x).reset_index(drop=True, inplace=False)
            data_y = pd.concat(datas_y).reset_index(drop=True, inplace=False)
            plt.figure()
            sns.jointplot(data_x, data_y, kind="hex")
            plt.savefig(str(output / f"hex_{name}_{group}.png"))
            plt.close("all")

    # Create kde plot in one dimension.
    group = "CLL"
    channel = "CD19 ECD"
    btubes = ("2", "3", "4")
    berlin_gsample = [c for c in berlin_sample if c.group == group]
    berlin_ts = [
        (tube, pd.concat(data_to_channel(berlin_gsample, tube, channel)).reset_index(drop=True))
        for tube in btubes
    ]

    mchannel = "CD19-APCA750"
    mtubes = ("1", "2", "3")
    munich_gsample = [c for c in munich_sample if c.group == group]
    munich_ts = [
        (tube, pd.concat(data_to_channel(munich_gsample, tube, mchannel)).reset_index(drop=True))
        for tube in mtubes
    ]

    fig, ax = plt.subplots()
    for tube, berlin_t in berlin_ts:
        sns.kdeplot(berlin_t, ax=ax, color="blue", label=f"Berlin {tube}")
    for tube, munich_t in munich_ts:
        sns.kdeplot(munich_t, ax=ax, color="red", label=f"Munich {tube}")
    fig.suptitle(f"{group} {channel} {mchannel}")
    fig.savefig(str(output / f"kde_munich_berlin_{group}_CD19.png"))

    # Create kde plot after rescaling.
    group = "CLL"
    channel = "Kappa FITC"
    btubes = ("2",)  # single-element tuple; ("2") would just be the string "2"
    berlin_gsample = [c for c in berlin_sample if c.group == group]
    berlin_ts = []
    for tube in btubes:
        datas = data_to_channel(berlin_gsample, tube, channel)
        transformed = []
        for data in datas:
            data = data.values
            data = data.reshape(-1, 1).astype("float32")
            tf = preprocessing.StandardScaler().fit_transform(data)
            transformed.append(tf.flatten())
        merged = np.concatenate(transformed)
        berlin_ts.append((tube, merged))

    mchannel = "Kappa-FITC"
    mtubes = ("2",)
    munich_gsample = [c for c in munich_sample if c.group == group]
    munich_ts = []
    for tube in mtubes:
        datas = data_to_channel(munich_gsample, tube, mchannel)
        transformed = []
        for data in datas:
            data = data.values
            data = data.reshape(-1, 1).astype("float32")
            tf = preprocessing.StandardScaler().fit_transform(data)
            transformed.append(tf.flatten())
        merged = np.concatenate(transformed)
        munich_ts.append((tube, merged))

    fig, ax = plt.subplots()
    for tube, berlin_t in berlin_ts:
        sns.kdeplot(berlin_t, ax=ax, color="blue", label=f"Berlin {tube}")
    for tube, munich_t in munich_ts:
        sns.kdeplot(munich_t, ax=ax, color="red", label=f"Munich {tube}")
    fig.suptitle(f"{group} {channel} {mchannel}")
    fig.savefig(str(output / f"standard_kde_munich_berlin_{group}_kappa.png"))