def main(args):
    """Train a binary CLL-vs-normal SOM classifier on the Munich dataset.

    Args:
        args: CLI namespace providing ``input`` (path to the SOM dataset)
            and ``output`` (destination URLPath for model artifacts).

    Side effects:
        Writes ``binarizer.joblib`` and ``model.h5`` into ``args.output``.
    """
    munich = SOMDataset.from_path(args.input)

    train, validate = munich.split(ratio=0.9, stratified=True)
    model = create_model(munich.dims, 1, global_decay=5e-3)
    # Binary crossentropy: the single output unit separates CLL from normal.
    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=[
            "acc",
        ]
    )
    binarizer = LabelBinarizer()
    binarizer.fit(["CLL", "normal"])

    trainseq = SOMSequence(train, binarizer, tube=1)
    validseq = SOMSequence(validate, binarizer, tube=1)

    model.fit_generator(
        epochs=20, generator=trainseq, validation_data=validseq)

    # Ensure the local output directory exists before saving artifacts.
    args.output.local.mkdir(parents=True, exist_ok=True)
    utils.save_joblib(binarizer, args.output / "binarizer.joblib")
    model.save(str(args.output / "model.h5"))
def main(data: utils.URLPath, meta: utils.URLPath, output: utils.URLPath, epochs: int = 30):
    """Train an 8-class CNN classifier on a SOM dataset and compute metrics.

    Args:
        data: Path to som dataset.
        meta: Path to dataset metadata (currently unused by this function).
        output: Output path for model, config, ids and metric artifacts.
        epochs: Number of training epochs.
    """
    tubes = ("1", "2", "3")
    pad_width = 2

    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]

    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)
    dataset = dataset.filter(groups=groups)

    dataset_groups = {d.group for d in dataset}
    if set(groups) != dataset_groups:
        raise RuntimeError(
            f"Group mismatch: {groups}, but got {dataset_groups}")

    train, validate = dataset.create_split(0.9, stratify=True)

    # Class imbalance is handled by resampling below, not by loss weighting.
    group_weights = None
    train = train.balance_per_group({
        "CM": 6000,
        "MCL": 1000,
        "PL": 1000,
        "LPL": 1000,
        "MZL": 1000,
        "FL": 1000,
        "HCL": 1000,
        "normal": 6000,
    })

    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    # Misclassification costs, always keyed as (true, pred).
    # NOTE(review): currently dead — cost_matrix stays None below, so these
    # costs never reach the loss; kept so weighted loss can be re-enabled.
    cost_mapping = {
        ("CLL", "MBL"): 0.5,
        ("MBL", "CLL"): 0.5,
        ("MCL", "PL"): 0.5,
        ("PL", "MCL"): 0.5,
        ("LPL", "MZL"): 0.5,
        ("MZL", "LPL"): 0.5,
        ("CLL", "normal"): 8,
        ("MBL", "normal"): 8,
        ("MCL", "normal"): 8,
        ("PL", "normal"): 8,
        ("LPL", "normal"): 8,
        ("MZL", "normal"): 8,
        ("FL", "normal"): 8,
        ("HCL", "normal"): 8,
    }
    if mapping:
        # Translate cost keys into the mapped group names.
        cost_mapping = {
            (mapping.get(a, a), mapping.get(b, b)): v
            for (a, b), v in cost_mapping.items()
        }
    cost_matrix = None

    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
        "cost_matrix": "cost_matrix.npy" if cost_matrix is not None else None,
    }
    io_functions.save_json(config, output / "config.json")

    # Grow the SOM grid dims to account for padding added by the sequences.
    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    binarizer, model = get_model(selected_tubes, groups=groups, global_decay=5e-5)

    if cost_matrix is not None:
        loss = classification_utils.WeightedCategoricalCrossentropy(
            cost_matrix)
    else:
        loss = "categorical_crossentropy"

    model.compile(
        loss=loss,
        optimizer="adam",
        metrics=[
            keras.metrics.CategoricalAccuracy(),
        ])

    # Dump a human-readable model summary next to the saved model.
    with (output / "model_summary.txt").open("w") as summary_file:
        def print_file(*args, **kwargs):
            print(*args, **kwargs, file=summary_file)
        model.summary(print_fn=print_file)
    keras.utils.plot_model(model, to_file=str(output / "model_plot.png"))

    def getter_fun(sample, tube):
        return sample.get_tube(tube)

    trainseq = SOMSequence(
        train, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=32,
        pad_width=pad_width)
    validseq = SOMSequence(
        validate, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=128,
        pad_width=pad_width)

    # Stop immediately if the loss diverges to NaN.
    nan_callback = keras.callbacks.TerminateOnNaN()

    history = model.fit_generator(
        epochs=epochs,
        shuffle=True,
        callbacks=[
            nan_callback
        ],
        class_weight=group_weights,
        generator=trainseq,
        validation_data=validseq)

    model.save(str(output / "model.h5"))
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    pred_arr = np.array(list(model.predict_generator(validseq)))
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    generate_all_metrics(
        true_labels, pred_labels, {"groups": groups, "map": {}},
        output / "unmapped")
    # Renamed loop var: the original shadowed the outer `mapping` dict.
    for map_name, group_map in mappings.GROUP_MAPS.items():
        output_path = output / map_name
        print(f"--- MAPPING: {map_name} ---")
        # Skip mappings with more groups than the model was trained on.
        if len(group_map["groups"]) > len(groups):
            continue
        generate_all_metrics(true_labels, pred_labels, group_map, output_path)

    plot_training_history(history, output / "training.png")
def main(data: utils.URLPath, output: utils.URLPath, model_name: str, modelargs: json.loads, epochs: int = 30):
    """Train a sklearn-style classifier on a SOM dataset and compute metrics.

    Args:
        data: Path to som dataset.
        output: Output path for ids, binarizer and metric artifacts.
        model_name: Model type passed through to ``get_model``
            (e.g. "RandomForest").
        modelargs: JSON-decoded dict of extra keyword arguments for
            model construction.
        epochs: Unused for non-iterative models; kept for CLI compatibility.
    """
    tubes = ("1", "2", "3")
    pad_width = 0

    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]

    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)
    dataset = dataset.filter(groups=groups)

    dataset_groups = {d.group for d in dataset}
    if set(groups) != dataset_groups:
        raise RuntimeError(f"Group mismatch: {groups}, but got {dataset_groups}")

    train, validate = dataset.create_split(0.9, stratify=True)

    # Class imbalance handled by per-group resampling.
    train = train.balance_per_group({
        "CM": 6000,
        "MCL": 1000,
        "PL": 1000,
        "LPL": 1000,
        "MZL": 1000,
        "FL": 1000,
        "HCL": 1000,
        "normal": 6000,
    })

    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
        "cost_matrix": None,
    }
    io_functions.save_json(config, output / "config.json")

    # Grow the SOM grid dims to account for padding (no-op with pad_width=0).
    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    # BUGFIX: forward the caller-supplied model_name instead of the previous
    # hard-coded "RandomForest", which silently ignored the parameter.
    binarizer, model = get_model(
        selected_tubes, groups=groups, model_name=model_name, **modelargs)

    def getter_fun(sample, tube):
        return sample.get_tube(tube)

    trainseq = SOMSequence(
        train, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=32,
        pad_width=pad_width)
    validseq = SOMSequence(
        validate, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=128,
        pad_width=pad_width)

    # Materialize the sequences into flat arrays for the sklearn-style API.
    xdata, ydata = sequence_to_array(trainseq)

    model.fit(xdata, ydata)

    xtest, ytest = sequence_to_array(validseq)
    pred_arr = model.predict(xtest)

    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    generate_all_metrics(
        true_labels, pred_labels, {"groups": groups, "map": {}},
        output / "unmapped")
    for map_name, group_map in mappings.GROUP_MAPS.items():
        output_path = output / map_name
        print(f"--- MAPPING: {map_name} ---")
        # Skip mappings with more groups than the model was trained on.
        if len(group_map["groups"]) > len(groups):
            continue
        generate_all_metrics(true_labels, pred_labels, group_map, output_path)
def main(data: utils.URLPath, meta: utils.URLPath, output: utils.URLPath):
    """Train a CNN on tubes 2/3/4 with balanced class weights and print metrics.

    Args:
        data: Path to som dataset.
        meta: Path to dataset metadata (currently unused by this function).
        output: Output path for model, config, ids and tensorboard logs.
    """
    tubes = ("2", "3", "4")
    pad_width = 1

    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]

    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)
    # LPL/MZL are excluded from training data for this experiment.
    dataset = dataset.filter(groups=[g for g in groups if g not in ("LPL", "MZL")])

    dataset_groups = {d.group for d in dataset}

    # NOTE(review): unlike the sibling trainers this unpacks validate first —
    # presumably create_split(10, ...) returns a small validation slice;
    # confirm against create_split's contract.
    validate, train = dataset.create_split(10, stratify=True)

    # Inverse-frequency loss weights, normalized so the smallest weight is 1.
    group_count = train.group_count
    num_cases = sum(group_count.values())
    balanced_nums = num_cases / len(dataset_groups)
    balanced_loss_weights = [
        balanced_nums / group_count.get(g, balanced_nums) for g in groups]
    min_ratio = min(balanced_loss_weights)
    balanced_loss_weights = {
        i: v / min_ratio for i, v in enumerate(balanced_loss_weights)}
    print(balanced_loss_weights)

    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
    }
    io_functions.save_json(config, output / "config.json")

    # Grow the SOM grid dims to account for padding added by the sequences.
    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    binarizer, model = get_model(selected_tubes, groups=groups, global_decay=5e-7)

    def getter_fun(sample, tube):
        return sample.get_tube(tube)

    trainseq = SOMSequence(
        train, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=32,
        pad_width=pad_width)
    validseq = SOMSequence(
        validate, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=128,
        pad_width=pad_width)

    tensorboard_dir = str(output / "tensorboard")
    tensorboard_callback = keras.callbacks.TensorBoard(
        log_dir=str(tensorboard_dir),
        histogram_freq=5,
        write_grads=True,
        write_images=True,
    )
    # Stop immediately if the loss diverges to NaN.
    nan_callback = keras.callbacks.TerminateOnNaN()

    model.fit_generator(
        epochs=15,
        shuffle=True,
        callbacks=[tensorboard_callback, nan_callback],
        class_weight=balanced_loss_weights,
        generator=trainseq,
        validation_data=validseq)

    model.save(str(output / "model.h5"))
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    pred_arr = np.array(list(model.predict_generator(validseq)))
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    confusion = metrics.confusion_matrix(true_labels, pred_labels, labels=groups)
    print(groups)
    print(confusion)
    balanced = metrics.balanced_accuracy_score(true_labels, pred_labels)
    print(balanced)
def get_validation_data(
        self, dataset: som_dataset.SOMDataset) -> som_dataset.SOMDataset:
    """Return the subset of *dataset* whose labels are the stored validation ids."""
    validation_labels = self.data_ids["validation"]
    return dataset.filter(labels=validation_labels)