def main():
    """Anonymize the unused-data case collection by moving its FCS samples.

    Loads the dataset, then copies every sample file into a fresh output
    tree via anon_move (defined elsewhere; presumably strips identifying
    metadata while moving -- TODO confirm).
    """
    # dataset = io_functions.load_case_collection(
    #     utils.URLPath("/data/flowcat-data/mll-flowdata/decCLL-9F"),
    #     utils.URLPath("/data/flowcat-data/mll-flowdata/decCLL-9F.2019-10-29.meta/test.json.gz")
    # )
    dataset = io_functions.load_case_collection(
        utils.URLPath("/data/flowcat-data/paper-cytometry/unused-data"),
    )
    LOGGER.info("Anonymizing dataset: %s", dataset)
    OUTPUT = utils.URLPath(
        "/data/flowcat-data/paper-cytometry-resubmit/unused_data_anonymized")
    data_dir = OUTPUT / "data"
    data_dir.mkdir()
    for case in dataset:
        # debugging hook for a single case, kept for reference
        # if case.id != "ffc59330acb49e6fcf5e679dbabcd01e56991345":
        #     continue
        for sample in case.samples:
            old_path = sample.complete_path
            # keep the sample's relative path below the new data dir
            new_path = data_dir / sample.path
            LOGGER.info("Saving %s sample to %s", case.id, new_path)
            new_path.parent.mkdir()
            anon_move(str(old_path), str(new_path))
def test_concatenation(self):
    """The '/' operator joins URLPath segments like pathlib paths."""
    for part_a, part_b, expected in (
        ("a", "b", "a/b"),
        ("/c", "d", "/c/d"),
        ("file:///a", "telnet", "file:///a/telnet"),
    ):
        joined = utils.URLPath(part_a) / utils.URLPath(part_b)
        self.assertEqual(str(joined), expected)
def main():
    """Render hexbin comparison plots for the FlowSOM retrain experiments."""
    output = utils.URLPath("output/4-flowsom-cmp/retrain_figures")
    output.mkdir()
    data = utils.URLPath("output/4-flowsom-cmp/retrain_tests_32_learning_rate")
    # data = utils.URLPath("output/4-flowsom-cmp/retrain_tests_32_radius")
    datasets = load_datasets(data)
    groups = mappings.GROUPS
    # merge all runs for one group/tube combination before plotting
    joined_datasets = merged_data(datasets, "CLL", "1")
    # plot_hexplot_datasets(joined_datasets, ("CD45-KrOr", "SS INT LIN"), output / "radius_cll_cd_45_ss.png")
    plot_hexplot_datasets(
        joined_datasets,
        ("CD20-PC7", "CD5-PacBlue"),
        output / "learn_rate_cd20_cd5.png")
def run_transfer(options, train_dataset, validate_dataset):
    """Train a transfer-learning SOM classifier and save model plus metrics.

    options: dict with "config" (classifier config), "base_model_path"
        (keras model to transfer from) and "output_path".
    train_dataset / validate_dataset: SOM datasets; validate_dataset may be
        None, in which case no validation sequence or metrics are produced.
    """
    config = options["config"]
    base_model = models.load_model(options["base_model_path"])
    # build the transfer model on top of the pretrained base
    tl_model = create_tl_model(base_model, config)
    model = SOMClassifier(config, tl_model)
    train = model.create_sequence(train_dataset, config.train_batch_size)
    if validate_dataset is not None:
        validate = model.create_sequence(validate_dataset, config.valid_batch_size)
    else:
        validate = None
    model.train_generator(train, validate, epochs=config.train_epochs, class_weight=None)
    output = utils.URLPath(options["output_path"])
    if validate:
        # save raw prediction matrix and label pairs for later analysis
        pred_arr, pred_labels = model.predict_generator(validate)
        true_labels = validate.true_labels
        pred_df = pd.DataFrame(pred_arr, columns=validate.binarizer.classes_, index=validate.dataset.labels)
        io_functions.save_csv(pred_df, output / "preds.csv")
        io_functions.save_json({"true": list(true_labels), "pred": list(pred_labels)}, output / "preds_labels.json")
        generate_all_metrics(true_labels, pred_labels, config.mapping, output)
    model.save(output)
    model.save_information(output)
    # free GPU/graph memory so repeated invocations do not accumulate state
    keras.backend.clear_session()
    del model
def sampleinfo_to_sample(sample_info: dict, case_id: str, dataset_path: utils.URLPath) -> "Sample":
    """Create a tube sample from sample info dict."""
    assert "fcs" in sample_info and "path" in sample_info[
        "fcs"], "Path to sample_info is missing"
    assert "date" in sample_info, "Date is missing"

    fcs_info = sample_info["fcs"]
    tube = str(sample_info.get("tube", "0"))
    material = Material.from_str(sample_info.get("material", ""))
    # sample id encodes case, tube, material and acquisition date
    sample_id = f"{case_id}_t{tube}_{material.name}_{sample_info['date']}"
    return FCSSample(
        id=sample_id,
        case_id=case_id,
        path=utils.URLPath(fcs_info["path"]),
        dataset_path=dataset_path,
        date=utils.str_to_date(sample_info["date"]),
        tube=tube,
        material=material,
        panel=sample_info.get("panel", ""),
        markers=fcs_info.get("markers", None),
        count=int(fcs_info.get("event_count", 0)) or None,
    )
def main(
        data: utils.URLPath = None,
        model: utils.URLPath = None,
        preds: utils.URLPath = None,
        output: utils.URLPath = None,
):
    """Run SOM t-SNE visualization for the unused-data cohort.

    NOTE(review): every CLI parameter is immediately overwritten with a
    hard-coded path below, so the arguments are effectively ignored; the
    `preds` parameter is never used at all.
    """
    data = utils.URLPath("/data/flowcat-data/paper-cytometry/som/unused")
    dataset = io_functions.load_case_collection(data, data + ".json.gz")
    # output = utils.URLPath("/data/flowcat-data/paper-cytometry/tsne")
    output = utils.URLPath("teststuff_unused_style")
    output.mkdir()
    # predictions = io_functions.load_json(utils.URLPath("/data/flowcat-data/paper-cytometry/tsne/prediction.json"))
    model = SOMClassifier.load(utils.URLPath("/data/flowcat-data/paper-cytometry/classifier"))
    som_tsne(dataset, model, output)
def main(data: utils.URLPath, kfold_dir: utils.URLPath, output: utils.URLPath):
    """Compute ROC curves for each k-fold model and their mean ROC.

    data: path of the SOM dataset.
    kfold_dir: directory containing one trained model per subdirectory.
    output: directory receiving per-fold ROC results and the mean curve.

    Folds whose validation split does not contain every configured group
    are skipped, since a full ROC cannot be computed for them.
    """
    # dataset = io_functions.load_case_collection(data, meta)
    # dataset.set_data_path(utils.URLPath(""))
    dataset = som_dataset.SOMDataset.from_path(data)
    # one model per immediate subdirectory of kfold_dir
    # (renamed from `dir`, which shadowed the builtin)
    model_paths = [
        utils.URLPath(os.path.join(kfold_dir, name))
        for name in next(os.walk(kfold_dir))[1]
    ]
    aucs = []
    curves = []
    for i, model_path in enumerate(model_paths):
        print(model_path)
        model = SOMClassifier.load(model_path)
        validate = model.get_validation_data(dataset)
        # guard clause instead of the previous `if ...: continue else:` block
        if len(validate.group_count.keys()) != len(model.config.groups):
            continue
        val_seq = model.create_sequence(validate)
        # renamed inner index to avoid reusing the outer loop variable `i`
        trues = np.concatenate([val_seq[j][1] for j in range(len(val_seq))])
        preds = np.array([p for p in model.model.predict_generator(val_seq)])
        auc, curve = create_roc_results(trues, preds, output / f"roc_n{i}", model)
        aucs.append(auc)
        curves.append(curve)
    compute_mean_ROC(curves, output)
def load(cls, path: str = None, ref_path: str = None, cls_path: str = None):
    """Load classifier from the given path, alternatively give a
    separate path for reference and classifier."""
    if path is not None:
        base = utils.URLPath(path)
        ref_path = base / "reference"
        cls_path = base / "classifier"
    elif ref_path is None or cls_path is None:
        raise ValueError(
            "Either path or ref_path and cls_path need to be set.")
    else:
        ref_path = utils.URLPath(ref_path)
        cls_path = utils.URLPath(cls_path)
    reference = io_functions.load_casesom(ref_path)
    return cls(reference, SOMClassifier.load(cls_path), SOMSaliency.load(cls_path))
def json_to_fcssample(samplejson: dict) -> "FCSSample":
    """Deserialize a sample JSON dict into an FCSSample (mutates the dict)."""
    samplejson["date"] = utils.str_to_date(samplejson["date"])
    samplejson["path"] = utils.URLPath(samplejson["path"])
    material = samplejson["material"]
    samplejson["material"] = Material[material] if material else None
    return FCSSample(**samplejson)
def test_addition(self):
    """The '+' operator appends a raw string without a path separator."""
    for base, suffix, expected in (
        ("testfile", "as", "testfileas"),
        # trailing slashes will get removed on creation
        ("/a/", "test", "/atest"),
        ("/file", ".lmd", "/file.lmd"),
    ):
        self.assertEqual(str(utils.URLPath(base) + suffix), expected)
def test_urls(self):
    """Scheme and netloc of a URL are split into private attributes."""
    for url, scheme, netloc in (
        ("a", "", ""),
        ("https://a", "https", "a"),
        ("https://dest.de/a", "https", "dest.de"),
    ):
        parsed = utils.URLPath(url)
        self.assertEqual(parsed._scheme, scheme)
        self.assertEqual(parsed._netloc, netloc)
def reconfigure_som_model(som_model: CaseSom, args: dict) -> CaseSom:
    """Reconfigure SOM by saving a copy and loading it again."""
    scratch_dir = utils.URLPath("/tmp/flowcat/sommodel")
    io_functions.save_casesom(som_model, scratch_dir)
    # reload with the new arguments applied, then drop the scratch copy
    reloaded = io_functions.load_casesom(scratch_dir, **args)
    rmtree(str(scratch_dir))
    return reloaded
def test_wrapping(self):
    """cast_urlpath coerces only URLPath-annotated arguments, whether
    they are passed positionally or by keyword."""
    @cast_urlpath
    def testfun(a: utils.URLPath = None, b: str = None):
        return a, b

    calls = [
        ((utils.URLPath("a"), "b"), {}),
        (("a", "b"), {}),
        (("a",), {"b": "b"}),
        ((), {"a": utils.URLPath("b"), "b": "a"}),
        ((), {"a": "b", "b": "a"}),
    ]
    for args, kwargs in calls:
        res = testfun(*args, **kwargs)
        self.assertEqual(type(res[0]), utils.URLPath)
        self.assertNotEqual(type(res[1]), utils.URLPath)
def run_denovo(options, train_dataset, validate_dataset):
    """Train a SOM classifier from scratch and save model plus metrics.

    options: dict with "config" (classifier config) and "output_path".
    train_dataset / validate_dataset: SOM datasets; validate_dataset may be
        falsy, in which case no predictions or metrics are written.
    """
    config = options["config"]
    model = train_som_classifier(train_dataset, validate_dataset, config)
    output = utils.URLPath(options["output_path"])
    if validate_dataset:
        validate = model.create_sequence(validate_dataset, config.valid_batch_size)
        # save raw prediction matrix and label pairs for later analysis
        pred_arr, pred_labels = model.predict_generator(validate)
        true_labels = validate.true_labels
        pred_df = pd.DataFrame(pred_arr, columns=validate.binarizer.classes_, index=validate.dataset.labels)
        io_functions.save_csv(pred_df, output / "preds.csv")
        io_functions.save_json({"true": list(true_labels), "pred": list(pred_labels)}, output / "preds_labels.json")
        generate_all_metrics(true_labels, pred_labels, config.mapping, output)
    model.save(output)
    model.save_information(output)
    # free GPU/graph memory so repeated invocations do not accumulate state
    keras.backend.clear_session()
    del model
""" from dataclasses import dataclass import pandas as pd import matplotlib as mpl mpl.use("Agg") import matplotlib.pyplot as plt import seaborn as sns import scipy.stats as sst from flowcat import utils, io_functions NAME = "result_analysis_removeedge" RESULTS = { "path": utils.URLPath("output"), "names": ["classifier_ungated", "classifier_gated_removeedge"], } OUTPUT = utils.URLPath(f"output/{NAME}") LOGGER = utils.setup_logging(utils.URLPath(f"logs/{NAME}_{utils.create_stamp()}"), NAME) def get_result_dirs(path: utils.URLPath, names: list): """Get result directories for individual iterations from given path and names""" result_dirs = { name: Metrics(list(map(Result, path.glob(f"./{name}*")))) for name in names } return result_dirs
import matplotlib
matplotlib.use("Agg")

from flowcat import io_functions, utils
from flowcat.plots import som as fc_somplot

# FIX: the f-prefix was missing here, so the log file was literally named
# "logs/visualize_datasets_{utils.create_stamp()}.log" with no timestamp.
LOGPATH = utils.URLPath(f"logs/visualize_datasets_{utils.create_stamp()}.log")
LOGGER = utils.logs.setup_logging(LOGPATH, "visualize_datasets")

OUTPUT = utils.URLPath("output/visualization/soms-ungated")
# OUTPUT.mkdir()
#
# som_dataset = io_functions.load_case_collection(utils.URLPath("output/classifier_ungated/som"))
#
# # testsample = som_dataset[0].samples[0]
#
# for case in som_dataset.filter(groups=["CLL"]):
#     testsample = case.get_tube("1", kind="som")
#     LOGGER.info(testsample)
#     somdata = testsample.get_data()
#     fig = fc_somplot.plot_som_grid(somdata, channels=["SS INT LIN", "CD45-KrOr", None])
#     fig.savefig(str(OUTPUT / f"test_{case.id}.png"))

OUTPUT = utils.URLPath("output/visualization/soms-original")
som_dataset = io_functions.load_case_collection(
    utils.URLPath("/data/flowcat-data/paper-cytometry/som/train"),
    utils.URLPath("/data/flowcat-data/paper-cytometry/som/train.json.gz"))
OUTPUT.mkdir()
for case in som_dataset.filter(groups=["CLL"]):
    testsample = case.get_tube("1", kind="som")
    LOGGER.info(testsample)
    somdata = testsample.get_data()
def print_usage():
    """print syntax of script invocation"""
    print("\nUsage:")
    print("python {0:} SOM_datapath outputpath panel(Erlangen, Bonn, MLL,"
          "or Berlin)\n".format(os.path.basename(sys.argv[0])))
    return


if __name__ == "__main__":
    if len(sys.argv) != 5:
        print_usage()
        raise Exception("Invalid arguments")

    SOM_DATASET = utils.URLPath(sys.argv[1])
    OUTPUT = utils.URLPath(sys.argv[2])
    PANEL = sys.argv[3]
    EPOCHS = int(sys.argv[4])
    LOGGER = utils.logs.setup_logging(None, "merged model")

    # set the groups according to the panel
    # FIX: this previously compared the undefined name `panel`; the CLI
    # argument is stored in PANEL, so the comparison raised a NameError.
    if PANEL == "MLL":
        groups = GROUPS
    elif PANEL == "ERLANGEN":
        groups = ["CLL", "MBL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]
    else:
        groups = ["CLL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]
def main(data: utils.URLPath, model: utils.URLPath, output: utils.URLPath):
    """Evaluate a SOM classifier and plot 2D embeddings of its features.

    Produces ROC and threshold results, then UMAP/t-SNE embeddings of the
    intermediate-layer activations and of the raw SOM data, for both the
    validation split and an additional "unknown" cohort.
    """
    dataset = io_functions.load_case_collection(data, data + ".json")
    dataset.set_data_path(utils.URLPath(""))

    model = SOMClassifier.load(model)
    validate = model.get_validation_data(dataset)
    val_seq = model.create_sequence(validate)
    trues = np.concatenate([val_seq[i][1] for i in range(len(val_seq))])
    preds = np.array([p for p in model.model.predict_generator(val_seq)])

    create_roc_results(trues, preds, output / "roc", model)
    create_threshold_results(trues, preds, output / "threshold", model)

    # tsne of result vectors
    embedding_path = output / "embedding-preds"
    embedding_path.mkdir()
    pred_labels = val_seq.true_labels
    # move "normal" to the front of the group list for consistent coloring
    # NOTE(review): list.remove mutates the list obtained from model.config;
    # this may alter the config's group order as a side effect -- confirm.
    groups = model.config["groups"]
    groups.remove("normal")
    groups = ["normal", *groups]
    # extra groups only present in the unknown cohort
    all_groups = groups + ["AML", "MM", "HCLv"]
    colors = sns.cubehelix_palette(len(all_groups), rot=4, dark=0.30)
    perplexity = 50

    # tsne of intermediate layers
    intermediate_model = keras.Model(
        inputs=model.model.input,
        outputs=model.model.get_layer("concatenate_1").output)
    intermed_preds = np.array(
        [p for p in intermediate_model.predict_generator(val_seq)])

    # unknown data
    udata = utils.URLPath("output/unknown-cohorts-processing/som/som")
    udataset = io_functions.load_case_collection(udata, udata + ".json")
    udataset.set_data_path(utils.URLPath(""))
    un_seq = model.create_sequence(udataset)
    intermed_upreds = np.array(
        [p for p in intermediate_model.predict_generator(un_seq)])
    all_intermed = np.concatenate((intermed_preds, intermed_upreds))
    all_labels = pred_labels + un_seq.true_labels

    umap_inter_all = UMAP(n_neighbors=30).fit_transform(all_intermed)
    plot_embedded(umap_inter_all, all_labels, all_groups, colors=colors).savefig(
        str(embedding_path / f"umap_intermediate_all.png"), dpi=300)
    tsne_inter_all = manifold.TSNE(
        perplexity=perplexity).fit_transform(all_intermed)
    plot_embedded(
        tsne_inter_all, all_labels, all_groups, colors=colors).savefig(str(
            embedding_path / f"tsne_intermediate_all_p{perplexity}.png"), dpi=300)

    # create som tsne for known and unknown data
    all_cases = validate.cases + udataset.cases
    case_data = []
    for case in all_cases:
        # flatten all tubes of a case into a single feature vector
        somdata = np.concatenate([
            case.get_tube(tube, kind="som").get_data().data
            for tube in model.config["tubes"]
        ], axis=2).flatten()
        case_data.append(somdata)
    case_data = np.array(case_data)
    perplexity = 50
    umap_som_all = UMAP(n_neighbors=30).fit_transform(case_data)
    plot_embedded(umap_som_all, all_labels, all_groups, colors=colors).savefig(
        str(embedding_path / f"umap_som_all.png"), dpi=300)
    tsne_som_all = manifold.TSNE(
        perplexity=perplexity).fit_transform(case_data)
    plot_embedded(tsne_som_all, all_labels, all_groups, colors=colors).savefig(
        str(embedding_path / f"tsne_som_all_p{perplexity}.png"), dpi=300)

    # plot legend
    fig = plt.figure()
    patches = [
        mpl.patches.Patch(color=color, label=group)
        for group, color in zip(all_groups, colors)
    ]
    fig.legend(patches, all_groups, loc='center', frameon=False)
    fig.savefig(str(embedding_path / "legend.png"), dpi=300)
import math


def create_class_weight(labels_dict, mu=0.30):
    """Compute log-scaled inverse-frequency class weights.

    labels_dict: mapping of class label -> sample count.
    mu: smoothing factor; larger mu produces larger weights for rare classes.
    Returns a dict mapping each label to max(log(mu * total / count), 1.0).

    FIX: this function was previously defined twice in this module (first
    with mu=0.15, then re-defined with mu=0.30); the first definition was
    silently shadowed. Kept a single definition with the effective default.
    """
    total = np.sum(list(labels_dict.values()))
    class_weight = dict()
    for key, count in labels_dict.items():
        score = math.log(mu * total / float(count))
        # never down-weight a class below 1.0
        class_weight[key] = score if score > 1.0 else 1.0
    return class_weight


SOM_DATASET = utils.URLPath("/data/flowcat-data/2020-04_merged_train/MLL5F")
OUTPUT = utils.URLPath(
    "/data/flowcat-data/2020-04_merged_train/TL/class_weights/mu_30/model_30")
def save(self, path: str):
    """Save the current model into the given path.

    Writes the SOM reference into ``<path>/reference`` and the classifier
    into ``<path>/classifier``, mirroring the layout that ``load`` expects.
    """
    path = utils.URLPath(path)
    path.mkdir()
    io_functions.save_casesom(self.reference, path / "reference")
    # FIX: was `path / path / "classifier"`, which nested the full path
    # inside itself and made the model unloadable from `path`.
    self.classifier.save(path / "classifier")
def json_to_somsample(samplejson: dict) -> "SOMSample":
    """Deserialize a sample JSON dict into a SOMSample (mutates the dict)."""
    samplejson["dims"] = tuple(samplejson["dims"])
    samplejson["path"] = utils.URLPath(samplejson["path"])
    samplejson["date"] = utils.str_to_date(samplejson["date"])
    return SOMSample(**samplejson)
model : base - 9F all CLL and normal samples target - 5F - increasing sample size (start with very few samples) 2) groups - only rare subtypes ( no CLL, MBl, normal) model : base - 9F all rare subtypes samples target - 5F - increasing sample size (start with very few samples) """ from flowcat import classifier, utils, io_functions from flowcat.constants import DEFAULT_CLASSIFIER_CONFIG, GROUPS, DEFAULT_CLASSIFIER_ARGS from flowcat import flowcat_api as fc_api from flowcat.classifier import som_dataset from flowcat.classifier.models import create_model_multi_input #MARKERS = io_functions.load_json(utils.URLPath("/data/flowcat-data/2020-04_merged_train/MLL9F/markers.json")) SOM_DATASET = utils.URLPath("/data/flowcat-data/2020-04_merged_train/MLL9F") OUTPUT = utils.URLPath("/data/flowcat-data/2020-04_merged_train/MLL9F/Exp1") LOGGER = utils.logs.setup_logging(None, "classify") groups = ["MCL", "PL", "LPL", "MZL", "FL", "HCL"] tubes = ("1") mapping = None dataset = som_dataset.SOMDataset.from_path(SOM_DATASET) train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset( dataset, split_ratio=0.9, groups=groups, mapping=mapping, balance=None) config = classifier.SOMClassifierConfig( **{ "tubes": {tube: dataset.config[tube]
def main(data: utils.URLPath, meta: utils.URLPath, reference: utils.URLPath, model: utils.URLPath):
    """Analyze a trained saliency SOM classifier.

    NOTE(review): all CLI arguments are immediately overwritten with
    hard-coded paths below; `reference` is never used. The script inspects
    layer weights, runs channel occlusion per group, aggregates saliency
    statistics for HCL cases, and finally renders saliency heatmaps and
    scatterplots for selected misclassified cases.
    """
    data, meta, soms, model = map(utils.URLPath, [
        "/data/flowcat-data/mll-flowdata/decCLL-9F",
        "output/0-final-dataset/train.json.gz",
        "output/som-fix-test/soms-test/som_r4_1",
        "output/0-final/classifier-minmax-new",
    ])
    dataset = io_functions.load_case_collection(data, meta)
    soms = som_dataset.SOMDataset.from_path(soms)
    model = SaliencySOMClassifier.load(model)
    val_dataset = model.get_validation_data(dataset)
    val_seq = model.create_sequence(soms)

    # printing out weights and biases, unsure whether they actually contain
    # information
    # in theory we could extend that to attempt to describe them as gates
    tube = "3"
    weights, biases = model.model.layers[int(tube) + 2].get_weights()
    for j, chname in enumerate(model.config["tubes"][tube]["channels"]):
        ch_mean_weight = np.mean(weights[:, :, j, :])
        print(j, chname, ch_mean_weight)
    for i in range(weights.shape[-1]):
        mean_weight = np.mean(weights[:, :, :, i])
        print(i, mean_weight, biases[i])
        for j, chname in enumerate(model.config["tubes"]["1"]["channels"]):
            print(i, j, chname)
            print(weights[:, :, j, i])

    # zero out specific columns and see how that impacts performance
    output = utils.URLPath("output/0-final/model-analysis/occlusion")
    for group in model.config["groups"]:
        print(group)
        sel_cases = val_dataset.filter(groups=[group])
        avg_results = model.channel_occlusion(sel_cases, val_seq)
        print(sorted(avg_results, key=lambda t: t[2], reverse=True))
        io_functions.save_json(avg_results, output / f"{group}_avg_std.json")

    # case_som = soms.get_labels([case.id]).iloc[0]
    # collect per-channel saliency maxima/means over all HCL cases
    hcls = val_dataset.filter(groups=["HCL"])
    from collections import defaultdict
    max_vals = defaultdict(lambda: defaultdict(list))
    mean_vals = defaultdict(lambda: defaultdict(list))
    for case in hcls:
        print(case)
        gradient = model.calculate_saliency(val_seq, case, case.group, maximization=False)
        for i, (tube, markers) in enumerate(model.config["tubes"].items()):
            tgrad = gradient[i]
            # NOTE(review): the inner loop reuses the name `i`, shadowing the
            # tube index of the enclosing loop for the rest of its body.
            for i, marker in enumerate(markers["channels"]):
                mgrad = tgrad[:, :, i]
                gmax = np.max(mgrad)
                max_vals[tube][marker].append(gmax)
                gmean = np.mean(mgrad)
                mean_vals[tube][marker].append(gmean)

    max_markers = defaultdict(list)
    for tube, markers in model.config["tubes"].items():
        for marker in markers["channels"]:
            print("Max", tube, marker, np.mean(max_vals[tube][marker]))
            print("Mean", tube, marker, np.mean(mean_vals[tube][marker]))
            max_markers[tube].append((marker, np.mean(max_vals[tube][marker])))
    for tube in model.config["tubes"]:
        print("Tube", tube)
        print("\n".join(": ".join((t[0], str(t[1]))) for t in sorted(
            max_markers[tube], key=lambda t: t[1], reverse=True)))

    # paths for the misclassification plotting section
    # NOTE(review): MLLDATA and several helpers (sd, cc, flowutils, cd, dd,
    # loaders, plotting, add_correct_magnitude, add_infiltration) are defined
    # elsewhere in this file -- not visible in this view.
    c_model = MLLDATA / "mll-sommaps/models/relunet_samplescaled_sommap_6class/model_0.h5"
    c_labels = MLLDATA / "mll-sommaps/output/relunet_samplescaled_sommap_6class/test_labels.json"
    c_preds = MLLDATA / "mll-sommaps/models/relunet_samplescaled_sommap_6class/predictions_0.csv"
    c_config = MLLDATA / "mll-sommaps/output/relunet_samplescaled_sommap_6class/config.json"
    c_cases = MLLDATA / "mll-flowdata/CLL-9F"
    c_sommaps = MLLDATA / "mll-sommaps/sample_maps/selected1_toroid_s32"
    c_misclass = MLLDATA / "mll-sommaps/misclassifications/"
    c_tube = [1, 2]

    # load datasets
    somdataset = sd.SOMDataset.from_path(c_sommaps)
    cases = cc.CaseCollection.from_path(c_cases, how="case_info.json")

    # filter datasets
    test_labels = flowutils.load_json(c_labels)
    filtered_cases = cases.filter(labels=test_labels)
    somdataset.data[1] = somdataset.data[1].loc[test_labels, :]

    # get mapping
    config = flowutils.load_json(c_config)
    groupinfo = mappings.GROUP_MAPS[config["c_groupmap"]]

    dataset = cd.CombinedDataset(filtered_cases, {
        dd.Dataset.from_str('SOM'): somdataset,
        dd.Dataset.from_str('FCS'): filtered_cases
    }, group_names=groupinfo['groups'])

    # modify mapping
    dataset.set_mapping(groupinfo)

    xoutputs = [
        loaders.loader_builder(
            loaders.Map2DLoader.create_inferred,
            tube=1,
            sel_count="counts",
            pad_width=1,
        ),
        loaders.loader_builder(
            loaders.Map2DLoader.create_inferred,
            tube=2,
            sel_count="counts",
            pad_width=1,
        )
    ]

    dataset = loaders.DatasetSequence.from_data(
        dataset, xoutputs, batch_size=1, draw_method="sequential")

    predictions = pd.read_csv(c_preds, index_col=0)
    predictions = add_correct_magnitude(predictions)
    predictions = add_infiltration(predictions, cases)

    misclass_labels = ['507777582649cbed8dfb3fe552a6f34f8b6c28e3']
    for label in misclass_labels:
        label_path = pathlib.Path(f"{c_misclass}/{label}")
        if not label_path.exists():
            label_path.mkdir()

        case = cases.get_label(label)

        # get the actual and the predicted class
        corr_group = predictions.loc[case.id, "correct"]
        pred_group = predictions.loc[case.id, "pred"]
        classes = [corr_group, pred_group]

        gradients = plotting.calc_saliency(
            dataset, case, c_model, classes=classes)

        for tube in c_tube:
            heatmaps = plotting.draw_saliency_heatmap(case, gradients, classes, tube)
            for idx, heatmap in enumerate(heatmaps):
                plotting.save_figure(
                    heatmap,
                    f"{c_misclass}/{label}/{classes[idx]}_tube_{tube}_saliency_heatmap.png"
                )

            scatterplots = plotting.plot_tube(
                case, tube, gradients[tube - 1], classes=classes,
                sommappath=c_sommaps)
            for idx, scatterplot in enumerate(scatterplots):
                plotting.save_figure(
                    scatterplot,
                    f"{c_misclass}/{label}/{classes[idx]}_tube_{tube}_scatterplots.png"
                )
"base_model_path": str(base_model_path / "model.h5"), "output_path": output_path / f"kfold_n{n}", "config": classifier.SOMClassifierConfig(**{ "tubes": {tube: dataset.config[tube] for tube in tubes}, "groups": groups, "pad_width": 2, "mapping": mapping, "cost_matrix": None, "train_epochs": 15, }) } run_transfer(options, train_dataset, validate_dataset) if __name__ == "__main__": OUTPUT = utils.URLPath("/data/flowcat-data/2021-01_kfold_n10_startified") LOGGER = utils.logs.setup_logging(OUTPUT / "logs.txt", "merged model with TL") experiments = { "mll5f": { "output_path": OUTPUT / "mll5f", "som_dataset_path": "/data/flowcat-data/2020_Nov_rerun/Merged_SOM/MLL5F", "panel": "MLL", "base_model_path": "/data/flowcat-data/2020_Nov_rerun/Merged_model/MLL9F", "k_number": 10, "rerun": False, "stratified": False, }, "bonn": { "output_path": OUTPUT / "bonn", "som_dataset_path": "/data/flowcat-data/2020_Nov_rerun/Merged_SOM/Bonn/with_9F_ref", "panel": "BONN",
    utils.logs.create_handler(utils.logs.print_stream()),
]


def setup_logging(logging_path, name):
    """Return a logger called ``name`` writing to ``logging_path``.

    Creates the parent directory of the log file before attaching handlers.
    """
    logging_path.parent.mkdir()
    logger = logging.getLogger(name)
    handlers = create_logging_handlers(logging_path)
    utils.logs.add_logger(logger, handlers)
    return logger


# input dataset and its metadata
INPUT = {
    "data": utils.URLPath("/data/flowcat-data/mll-flowdata/decCLL-9F"),
    "meta": utils.URLPath(
        "/data/flowcat-data/mll-flowdata/decCLL-9F.2019-10-29.meta/train.json.gz"
    ),
}
OUTPUT = utils.URLPath("output/samples")
LOGPATH = utils.URLPath(f"logs/filter_samples_{utils.create_stamp()}.log")
LOGGER = setup_logging(LOGPATH, "filter_samples")

# seed for reproducible sampling
fc_seed.set_seed(42)
OUTPUT.mkdir()

train_dataset = io_functions.load_case_collection(INPUT["data"], INPUT["meta"])
def print_usage(): """print syntax of script invocation""" print("\nUsage:") print("python {0:} SOM_datapath outputpath panel(Erlangen, Bonn, MLL," "or Berlin) basemodel_path\n".format(os.path.basename(sys.argv[0]))) return if __name__ == "__main__": if len(sys.argv) != 6: print_usage() raise Exception("Invalid arguments") SOM_DATASET = utils.URLPath(sys.argv[1]) OUTPUT = utils.URLPath(sys.argv[2]) PANEL = sys.argv[3] BASE_MODEL_PATH = utils.URLPath(sys.argv[4]) EPOCHS = int(sys.argv[5]) LOGGER = utils.logs.setup_logging(None, "merged model with TL") # set the groups according to the panel if panel == "MLL": groups = GROUPS elif panel == "ERLANGEN": groups = ["CLL", "MBL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"] else:
def load_case_collection(data: str, meta: str = None):
    """Load a case collection, coercing string paths to URLPath first."""
    meta_path = utils.URLPath(meta) if meta is not None else None
    return io_functions.load_case_collection(utils.URLPath(data), meta_path)
def __init__(self, path, *args, **kwargs):
    """Initialize from the dataset located at ``path``.

    path: dataset location; str or URLPath (coerced below).
    Remaining positional/keyword arguments are forwarded to the parent
    initializer.
    """
    super().__init__(*args, **kwargs)
    path = utils.URLPath(path)  # accept plain strings as well
    self.data = case_dataset.CaseCollection.from_path(path)
from collections import defaultdict

from flowcat import io_functions, utils, seed as fc_seed

INPUT = {
    "data": utils.URLPath("/data/flowcat-data/mll-flowdata/decCLL-9F"),
    "meta": utils.URLPath(
        "/data/flowcat-data/mll-flowdata/decCLL-9F.2019-10-29.meta/train.json.gz"
    ),
    "meta_test": utils.URLPath(
        "/data/flowcat-data/mll-flowdata/decCLL-9F.2019-10-29.meta/test.json.gz"
    ),
}

# NOTE(review): despite its name, this loads the TEST metadata
# ("meta_test"), not the train split.
train_dataset = io_functions.load_case_collection(INPUT["data"], INPUT["meta_test"])

# sort by infiltration; zero-infiltration cases are pushed to the end
# via the sentinel value 1000
sorted_cases = sorted(
    train_dataset,
    key=lambda c: c.infiltration if c.infiltration > 0.0 else 1000)

# count how many of the 100 lowest-infiltration cases sit exactly at 0.1
perc01_count = 0
group_count = defaultdict(int)
for case in sorted_cases[:100]:
    print("Minimal infiltration sample:", case, case.infiltration)
    if case.infiltration == 0.1:
        perc01_count += 1
        group_count[case.group] += 1
print(perc01_count)
""" Acquire FCS information needed for Miflowcyt document. Also roughly check whether we have strongly diverging data in our dataset. """ from flowcat import dataset as fc_dataset, io_functions, utils import fcsparser def section(text, level=4, deco="#"): deco_text = deco * level section_text = f"{deco_text} {text} {deco_text}" print(section_text) train_dataset = io_functions.load_case_collection(utils.URLPath("/data/flowcat-data/mll-flowdata/decCLL-9F"), utils.URLPath("/data/flowcat-data/mll-flowdata/decCLL-9F.2019-10-29.meta/train.json.gz")) test_dataset = io_functions.load_case_collection(utils.URLPath("/data/flowcat-data/mll-flowdata/decCLL-9F"), utils.URLPath("/data/flowcat-data/mll-flowdata/decCLL-9F.2019-10-29.meta/test.json.gz")) print("Loading all data used in paper analysis.") dataset = train_dataset + test_dataset print(dataset) section("Get info for case 0") case = dataset[0] print(case) sample = case.samples[0] meta, data = fcsparser.parse(sample.complete_path) for i in range(1, 13): name = f"$P{i}S" voltage = f"$P{i}V"