Exemplo n.º 1
0
    def save(self, path: utils.URLPath):
        """Persist the classifier (config, keras model, binarizer, split ids) under *path*."""
        # Configuration first, then the trained artifacts.
        save_somclassifier_config(self.config, path / "config.json")
        self.model.save(str(path / "model.h5"))
        io_functions.save_joblib(self.binarizer, path / "binarizer.joblib")

        # Record which case ids went into each split.
        for split, filename in (("validation", "ids_validate.json"),
                                ("train", "ids_train.json")):
            io_functions.save_json(self.data_ids[split], path / filename)
Exemplo n.º 2
0
def main(data: utils.URLPath,
         meta: utils.URLPath,
         output: utils.URLPath,
         epochs: int = 30):
    """Train a keras SOM classifier on the 8-class group mapping.

    Args:
        data: Path to som dataset
        meta: Path to dataset metainformation (unused in this body; the
            load_case_collection call that consumed it is commented out)
        output: Output path
        epochs: Number of training epochs
    """
    tubes = ("1", "2", "3")
    # Border padding applied to each SOM plane; tube dims are grown below.
    pad_width = 2

    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]
    # mapping = None
    # groups = mappings.GROUPS

    # dataset = io_functions.load_case_collection(data, meta)
    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)

    dataset = dataset.filter(groups=groups)

    dataset_groups = {d.group for d in dataset}

    # Fail early if the filtered dataset does not contain exactly the
    # expected groups.
    if set(groups) != dataset_groups:
        raise RuntimeError(
            f"Group mismatch: {groups}, but got {dataset_groups}")

    train, validate = dataset.create_split(0.9, stratify=True)

    # Class weighting is disabled; the commented block shows how it was
    # previously derived from per-group counts.
    group_weights = None
    # group_count = train.group_count
    # group_weights = classification_utils.calculate_group_weights(group_count)
    # group_weights = {
    #     i: group_weights.get(g, 1.0) for i, g in enumerate(groups)
    # }

    # Rebalance training set to fixed per-group case counts -- presumably
    # by resampling; TODO confirm balance_per_group semantics.
    # train = train.balance(2000)
    train = train.balance_per_group({
        "CM": 6000,
        # "CLL": 4000,
        # "MBL": 2000,
        "MCL": 1000,
        "PL": 1000,
        "LPL": 1000,
        "MZL": 1000,
        "FL": 1000,
        "HCL": 1000,
        "normal": 6000,
    })

    # Persist split membership for reproducibility.
    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    # NOTE(review): relies on URLPath supporting `+` with a plain string --
    # confirm against utils.URLPath.
    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    # always (true, pred)
    cost_mapping = {
        ("CLL", "MBL"): 0.5,
        ("MBL", "CLL"): 0.5,
        ("MCL", "PL"): 0.5,
        ("PL", "MCL"): 0.5,
        ("LPL", "MZL"): 0.5,
        ("MZL", "LPL"): 0.5,
        ("CLL", "normal"): 8,
        ("MBL", "normal"): 8,
        ("MCL", "normal"): 8,
        ("PL", "normal"): 8,
        ("LPL", "normal"): 8,
        ("MZL", "normal"): 8,
        ("FL", "normal"): 8,
        ("HCL", "normal"): 8,
    }
    # Translate cost pairs into the mapped group names.
    if mapping:
        cost_mapping = {(mapping.get(a, a), mapping.get(b, b)): v
                        for (a, b), v in cost_mapping.items()}
    # cost_matrix = classification_utils.build_cost_matrix(cost_mapping, groups)
    # np.save(str(output / "cost_matrix.npy"), cost_matrix)
    # Cost-sensitive loss currently disabled (see loss selection below).
    cost_matrix = None

    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
        "cost_matrix": "cost_matrix.npy" if cost_matrix is not None else None,
    }
    io_functions.save_json(config, output / "config.json")

    # Grow each tube's SOM grid by the padding on both sides (depth
    # dimension z is unchanged).
    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width,
                                        z)

    binarizer, model = get_model(selected_tubes,
                                 groups=groups,
                                 global_decay=5e-5)

    if cost_matrix is not None:
        loss = classification_utils.WeightedCategoricalCrossentropy(
            cost_matrix)
    else:
        loss = "categorical_crossentropy"

    model.compile(
        loss=loss,
        # loss="categorical_crossentropy",
        # loss="binary_crossentropy",
        optimizer="adam",
        # optimizer=optimizers.Adam(lr=0.0, decay=0.0, epsilon=epsilon),
        metrics=[
            keras.metrics.CategoricalAccuracy(),
        ])
    # Write the textual model summary to a file instead of stdout.
    with (output / "model_summary.txt").open("w") as summary_file:

        def print_file(*args, **kwargs):
            print(*args, **kwargs, file=summary_file)

        model.summary(print_fn=print_file)

    keras.utils.plot_model(model, to_file=str(output / "model_plot.png"))

    def getter_fun(sample, tube):
        return sample.get_tube(tube)

    trainseq = SOMSequence(train,
                           binarizer,
                           tube=tubes,
                           get_array_fun=getter_fun,
                           batch_size=32,
                           pad_width=pad_width)
    validseq = SOMSequence(validate,
                           binarizer,
                           tube=tubes,
                           get_array_fun=getter_fun,
                           batch_size=128,
                           pad_width=pad_width)

    # tensorboard_dir = str(output / "tensorboard")
    # tensorboard_callback = keras.callbacks.TensorBoard(
    #     log_dir=str(tensorboard_dir),
    #     histogram_freq=5,
    #     write_grads=True,
    #     write_images=True,
    # )
    nan_callback = keras.callbacks.TerminateOnNaN()

    history = model.fit_generator(
        epochs=epochs,
        shuffle=True,
        callbacks=[
            # tensorboard_callback,
            nan_callback
        ],
        class_weight=group_weights,
        generator=trainseq,
        validation_data=validseq)

    model.save(str(output / "model.h5"))
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    # Collect per-batch predictions and decode them back to group labels.
    preds = []
    for pred in model.predict_generator(validseq):
        preds.append(pred)
    pred_arr = np.array(preds)
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    generate_all_metrics(true_labels, pred_labels, {
        "groups": groups,
        "map": {}
    }, output / "unmapped")
    # NOTE: `mapping` is rebound by this loop, shadowing the earlier value
    # (harmless here since it is not used again afterwards).
    for map_name, mapping in mappings.GROUP_MAPS.items():
        output_path = output / map_name
        # skip if more groups in map
        print(f"--- MAPPING: {map_name} ---")
        if len(mapping["groups"]) > len(groups):
            continue
        generate_all_metrics(true_labels, pred_labels, mapping, output_path)

    plot_training_history(history, output / "training.png")
Exemplo n.º 3
0
def main(data: utils.URLPath, output: utils.URLPath, model_name: str, modelargs: json.loads, epochs: int = 30):
    """Train a non-neural (RandomForest) SOM classifier on the 8-class mapping.

    Args:
        data: Path to som dataset
        output: Output path
        model_name: Requested model name (NOTE: the get_model call below
            hard-codes model_name="RandomForest", so this argument is
            currently ignored)
        modelargs: Extra keyword arguments forwarded to the model; the
            `json.loads` annotation is presumably consumed by the CLI layer
            as a converter -- TODO confirm
        epochs: Unused in this body (no iterative training for this model)
    """
    tubes = ("1", "2", "3")
    # No border padding for the non-convolutional model.
    pad_width = 0

    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]
    # mapping = None
    # groups = mappings.GROUPS

    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)

    dataset = dataset.filter(groups=groups)

    dataset_groups = {d.group for d in dataset}

    # Fail early if the filtered dataset does not match the expected groups.
    if set(groups) != dataset_groups:
        raise RuntimeError(f"Group mismatch: {groups}, but got {dataset_groups}")

    train, validate = dataset.create_split(0.9, stratify=True)

    # Rebalance training set to fixed per-group case counts.
    # train = train.balance(20)
    train = train.balance_per_group({
        "CM": 6000,
        # "CLL": 4000,
        # "MBL": 2000,
        "MCL": 1000,
        "PL": 1000,
        "LPL": 1000,
        "MZL": 1000,
        "FL": 1000,
        "HCL": 1000,
        "normal": 6000,
    })

    # Persist split membership for reproducibility.
    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    # NOTE(review): relies on URLPath supporting `+` with a plain string.
    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
        "cost_matrix": None,
    }
    io_functions.save_json(config, output / "config.json")

    # With pad_width == 0 this is a no-op, but keeps parity with the CNN
    # training scripts.
    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    # binarizer, model = get_model(selected_tubes, groups=groups, n_neighbors=1)
    binarizer, model = get_model(selected_tubes, groups=groups, model_name="RandomForest", **modelargs)

    def getter_fun(sample, tube):
        return sample.get_tube(tube)

    trainseq = SOMSequence(
        train,
        binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=32,
        pad_width=pad_width)
    validseq = SOMSequence(
        validate,
        binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=128,
        pad_width=pad_width)

    # Materialize the whole sequence: sklearn-style models need arrays,
    # not generators.
    xdata, ydata = sequence_to_array(trainseq)

    model.fit(xdata, ydata)

    xtest, ytest = sequence_to_array(validseq)
    pred_arr = model.predict(xtest)

    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    generate_all_metrics(
        true_labels, pred_labels, {"groups": groups, "map": {}}, output / "unmapped")
    # NOTE: `mapping` is rebound by this loop, shadowing the earlier value
    # (harmless here since it is not used again afterwards).
    for map_name, mapping in mappings.GROUP_MAPS.items():
        output_path = output / map_name
        # skip if more groups in map
        print(f"--- MAPPING: {map_name} ---")
        if len(mapping["groups"]) > len(groups):
            continue
        generate_all_metrics(true_labels, pred_labels, mapping, output_path)
Exemplo n.º 4
0
def main(args):
    """Fine-tune a previously saved SOM classifier on MCL vs. PL cases.

    Args:
        args: parsed arguments providing ``input`` (SOM dataset path),
            ``val`` / ``train`` (paths to JSON label lists), ``model``
            (path of the saved model to load) and ``output`` (result dir).
    """
    dataset = som_dataset.SOMDataset.from_path(args.input)
    val = args.val
    train = args.train
    OUTPUT = args.output

    groups = ["MCL", "PL"]
    # BUG FIX: ("1") is just the string "1", not a tuple -- a one-element
    # tuple needs the trailing comma. Iterating the string only worked by
    # accident because the tube name is a single character.
    tubes = ("1",)
    mapping = None
    balance = {
        "MCL": 20,
        "PL": 20,
    }

    config = classifier.SOMClassifierConfig(
        **{
            "tubes": {tube: dataset.config[tube]
                      for tube in tubes},
            "groups": groups,
            "pad_width": 2,
            "mapping": mapping,
            "cost_matrix": None,
        })
    # Validation membership is fixed by an explicit label list.
    val = io_functions.load_json(val)
    validate_dataset = dataset.filter(labels=val)

    labels = io_functions.load_json(train)
    train_dataset = dataset.filter(labels=labels)

    train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
        train_dataset,
        split_ratio=0.9,
        groups=groups,
        mapping=mapping,
        balance=balance,
        val_dataset=validate_dataset)

    print(train_dataset.group_count)
    print(validate_dataset.group_count)

    binarizer, model = load_model(args.model)

    trainseq = som_dataset.SOMSequence(train_dataset,
                                       binarizer,
                                       tube=config.tubes,
                                       pad_width=config.pad_width)
    validseq = som_dataset.SOMSequence(validate_dataset,
                                       binarizer,
                                       tube=config.tubes,
                                       pad_width=config.pad_width)

    model.fit_generator(generator=trainseq,
                        epochs=10,
                        validation_data=validseq)

    # Persist everything needed to reload the classifier later.
    # (OUTPUT is args.output; use it consistently.)
    OUTPUT.mkdir(parents=True, exist_ok=True)
    io_functions.save_joblib(binarizer, OUTPUT / "binarizer.joblib")
    model.save(str(OUTPUT / "model.h5"))

    io_functions.save_json(config.to_json(), OUTPUT / "config.json")
    io_functions.save_json(validseq.dataset.labels,
                           OUTPUT / "ids_validate.json")
    io_functions.save_json(trainseq.dataset.labels, OUTPUT / "ids_train.json")
Exemplo n.º 5
0
def main(data: utils.URLPath, meta: utils.URLPath, output: utils.URLPath):
    """Train a keras classifier directly on raw FCS data (no SOM step).

    Args:
        data: Path to fcs dataset data
        meta: Path to fcs dataset metainformation
        output: Output path
    """
    tubes = ("1", "2")
    # Number of events drawn per case -- presumably per tube; TODO confirm
    # against FCSSequence.
    sample_size = 512
    # group_mapping = mappings.GROUP_MAPS["6class"]
    # mapping = group_mapping["map"]
    mapping = None
    groups = mappings.GROUPS
    # groups = group_mapping["groups"]

    dataset = io_functions.load_case_collection(data, meta)
    if mapping:
        dataset = dataset.map_groups(mapping)
    dataset = dataset.filter(groups=groups)

    # NOTE: validate comes first here; the 50 is presumably a fixed case
    # count for the validation split -- confirm create_split semantics.
    validate, train = dataset.create_split(50)
    print(train.group_count)
    # train = train.balance(1000).shuffle()
    train = train.sample(100).shuffle()
    print(train.group_count)

    # Inverse-frequency class weights, re-keyed by group index as expected
    # by keras `class_weight`.
    group_count = train.group_count
    group_weights = classification_utils.calculate_group_weights(group_count)
    group_weights = {
        i: group_weights.get(g, 1.0)
        for i, g in enumerate(groups)
    }

    # Persist split membership for reproducibility.
    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    binarizer = LabelBinarizer()
    binarizer.fit(groups)

    train_seq = FCSSequence(train,
                            binarizer,
                            tubes=tubes,
                            sample_size=sample_size,
                            batch_size=64)
    validate_seq = FCSSequence(validate,
                               binarizer,
                               tubes=tubes,
                               sample_size=sample_size,
                               batch_size=128)

    config = {
        "tubes": tubes,
        "groups": groups,
    }
    io_functions.save_json(config, output / "config.json")

    # for tube in tubes:
    #     x, y, z = selected_tubes[tube]["dims"]
    #     selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    # Misclassification costs as (true, pred) pairs, used to build the
    # weighted loss below.
    cost_mapping = {
        ("CLL", "MBL"): 0.5,
        ("MBL", "CLL"): 0.5,
        ("MCL", "PL"): 0.5,
        ("PL", "MCL"): 0.5,
        ("LPL", "MZL"): 0.5,
        ("MZL", "LPL"): 0.5,
        ("CLL", "normal"): 2,
        ("MBL", "normal"): 2,
        ("MCL", "normal"): 2,
        ("PL", "normal"): 2,
        ("LPL", "normal"): 2,
        ("MZL", "normal"): 2,
        ("FL", "normal"): 2,
        ("HCL", "normal"): 2,
    }
    cost_matrix = classification_utils.build_cost_matrix(cost_mapping, groups)

    model = create_fcs_model(train_seq.xshape,
                             train_seq.yshape,
                             global_decay=5e-5)
    model.compile(
        # loss="categorical_crossentropy",
        # loss=keras.losses.CategoricalCrossentropy(),
        loss=classification_utils.WeightedCategoricalCrossentropy(cost_matrix),
        # loss="binary_crossentropy",
        optimizer="adam",
        # optimizer=optimizers.Adam(lr=0.0, decay=0.0, epsilon=epsilon),
        metrics=[
            "acc",
            # keras.metrics.CategoricalAccuracy(),
            # keras.metrics.TopKCategoricalAccuracy(k=2),
            # top2_acc,
        ])
    model.summary()

    # TensorBoard callback is constructed but NOT registered below (it is
    # commented out of the callbacks list).
    tensorboard_dir = str(output / "tensorboard")
    tensorboard_callback = keras.callbacks.TensorBoard(
        log_dir=str(tensorboard_dir),
        histogram_freq=5,
        write_grads=True,
        write_images=True,
    )
    nan_callback = keras.callbacks.TerminateOnNaN()

    model.fit_generator(
        epochs=20,
        shuffle=True,
        callbacks=[
            # tensorboard_callback,
            nan_callback
        ],
        class_weight=group_weights,
        generator=train_seq,
        validation_data=validate_seq)

    model.save(str(output / "model.h5"))
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    # Collect per-batch predictions and decode them back to group labels.
    preds = []
    for pred in model.predict_generator(validate_seq):
        preds.append(pred)
    pred_arr = np.array(preds)
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validate_seq.true_labels

    generate_all_metrics(true_labels, pred_labels, {
        "groups": groups,
        "map": {}
    }, output / "unmapped")
    # NOTE: `mapping` is rebound by this loop, shadowing the earlier value
    # (harmless here since it is not used again afterwards).
    for map_name, mapping in mappings.GROUP_MAPS.items():
        output_path = output / map_name
        # skip if more groups in map
        print(f"--- MAPPING: {map_name} ---")
        if len(mapping["groups"]) > len(groups):
            continue
        generate_all_metrics(true_labels, pred_labels, mapping, output_path)
Exemplo n.º 6
0
def main(data: utils.URLPath, meta: utils.URLPath, output: utils.URLPath):
    """Train a SOM classifier on tubes 2-4, excluding LPL/MZL cases.

    Args:
        data: Path to som dataset
        meta: Path to dataset metadata (unused in this body; the
            load_case_collection call that consumed it is commented out)
        output: Output path
    """
    tubes = ("2", "3", "4")
    pad_width = 1

    group_mapping = mappings.GROUP_MAPS["8class"]
    mapping = group_mapping["map"]
    groups = group_mapping["groups"]

    # dataset = io_functions.load_case_collection(data, meta)
    dataset = SOMDataset.from_path(data)
    if mapping:
        dataset = dataset.map_groups(mapping)

    # Deliberately drops LPL and MZL cases, so the dataset holds fewer
    # groups than `groups` (hence the mismatch check below is disabled).
    dataset = dataset.filter(groups=[g for g in groups if g not in ("LPL", "MZL")])

    dataset_groups = {d.group for d in dataset}

    # if set(groups) != dataset_groups:
    #     raise RuntimeError(f"Group mismatch: {groups}, but got {dataset_groups}")

    # NOTE: validate comes first here (10 presumably validation cases --
    # confirm create_split semantics).
    validate, train = dataset.create_split(10, stratify=True)

    # Hand-rolled inverse-frequency loss weights: weight ~ mean group size /
    # actual group size, normalized so the smallest weight is 1, keyed by
    # group index for keras `class_weight`. Missing groups fall back to
    # `balanced_nums`, i.e. weight 1 before normalization.
    group_count = train.group_count
    num_cases = sum(group_count.values())
    balanced_nums = num_cases / len(dataset_groups)
    balanced_loss_weights = [balanced_nums / group_count.get(g, balanced_nums) for g in groups]
    min_ratio = min(balanced_loss_weights)
    balanced_loss_weights = {i: v / min_ratio for i, v in enumerate(balanced_loss_weights)}
    print(balanced_loss_weights)

    # train = train.balance(2000)
    # train = train.balance_per_group({
    #     "CM": 6000,
    #     # "CLL": 4000,
    #     # "MBL": 2000,
    #     "MCL": 1000,
    #     "PL": 1000,
    #     "LPL": 1000,
    #     "MZL": 1000,
    #     "FL": 1000,
    #     "HCL": 1000,
    #     "normal": 6000,
    # })

    # Persist split membership for reproducibility.
    io_functions.save_json(train.labels, output / "ids_train.json")
    io_functions.save_json(validate.labels, output / "ids_validate.json")

    # NOTE(review): relies on URLPath supporting `+` with a plain string.
    som_config = io_functions.load_json(data + "_config.json")
    selected_tubes = {tube: som_config[tube] for tube in tubes}

    config = {
        "tubes": selected_tubes,
        "groups": groups,
        "pad_width": pad_width,
        "mapping": group_mapping,
    }
    io_functions.save_json(config, output / "config.json")

    # Grow each tube's SOM grid by the padding on both sides.
    for tube in tubes:
        x, y, z = selected_tubes[tube]["dims"]
        selected_tubes[tube]["dims"] = (x + 2 * pad_width, y + 2 * pad_width, z)

    binarizer, model = get_model(selected_tubes, groups=groups, global_decay=5e-7)

    def getter_fun(sample, tube):
        return sample.get_tube(tube)

    trainseq = SOMSequence(
        train, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=32,
        pad_width=pad_width)
    validseq = SOMSequence(
        validate, binarizer,
        tube=tubes,
        get_array_fun=getter_fun,
        batch_size=128,
        pad_width=pad_width)

    # Unlike the other training scripts, TensorBoard logging is active here.
    tensorboard_dir = str(output / "tensorboard")
    tensorboard_callback = keras.callbacks.TensorBoard(
        log_dir=str(tensorboard_dir),
        histogram_freq=5,
        write_grads=True,
        write_images=True,
    )
    nan_callback = keras.callbacks.TerminateOnNaN()

    model.fit_generator(
        epochs=15, shuffle=True,
        callbacks=[tensorboard_callback, nan_callback],
        class_weight=balanced_loss_weights,
        generator=trainseq, validation_data=validseq)

    model.save(str(output / "model.h5"))
    io_functions.save_joblib(binarizer, output / "binarizer.joblib")

    # Collect per-batch predictions and decode them back to group labels.
    preds = []
    for pred in model.predict_generator(validseq):
        preds.append(pred)
    pred_arr = np.array(preds)
    pred_labels = binarizer.inverse_transform(pred_arr)
    true_labels = validseq.true_labels

    # Simple console evaluation instead of generate_all_metrics.
    confusion = metrics.confusion_matrix(true_labels, pred_labels, labels=groups)
    print(groups)
    print(confusion)
    balanced = metrics.balanced_accuracy_score(true_labels, pred_labels)
    print(balanced)
Exemplo n.º 7
0
def main(args):
    """Train a binary MCL-vs-PL classifier from scratch on a SOM dataset.

    Args:
        args: parsed arguments providing ``input`` (SOM dataset path) and
            ``output`` (directory for model, binarizer, config and ids).
    """
    MLL5F = som_dataset.SOMDataset.from_path(args.input)
    OUTPUT = args.output
    LOGGER = utils.logs.setup_logging(None, "classify")

    groups = ["MCL", "PL"]
    # BUG FIX: ("1") is just the string "1", not a tuple -- a one-element
    # tuple needs the trailing comma. Iterating the string only worked by
    # accident because the tube name is a single character.
    tubes = ("1",)
    mapping = None
    # NOTE(review): defined but not passed below (balance=None) -- kept so
    # it can be re-enabled; confirm whether balancing is intended.
    balance = {
        "MCL": 20,
        "PL": 20,
    }

    # 90/10 train/validation split; no explicit validation label list.
    train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
        MLL5F,
        split_ratio=0.90,
        groups=groups,
        mapping=mapping,
        balance=None)

    print(train_dataset.group_count)
    print(validate_dataset.group_count)

    config = classifier.SOMClassifierConfig(**{
        "tubes": {tube: MLL5F.config[tube] for tube in tubes},
        "groups": groups,
        "pad_width": 2,
        "mapping": mapping,
        "cost_matrix": None,
    })

    # Single output unit, paired with binary_crossentropy below.
    model = create_model(config.inputs, 1, global_decay=5e-3)

    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=[
            "acc",
        ]
    )

    binarizer = LabelBinarizer()
    binarizer.fit(groups)

    trainseq = som_dataset.SOMSequence(
        train_dataset, binarizer,
        tube=config.tubes, pad_width=config.pad_width)
    validseq = som_dataset.SOMSequence(
        validate_dataset, binarizer,
        tube=config.tubes, pad_width=config.pad_width)

    model.fit_generator(generator=trainseq, validation_data=validseq,
                        epochs=20, shuffle=True, class_weight=None)

    # Persist everything needed to reload the classifier later.
    OUTPUT.mkdir(parents=True, exist_ok=True)
    io_functions.save_joblib(binarizer, OUTPUT / "binarizer.joblib")
    model.save(str(OUTPUT / "model.h5"))

    io_functions.save_json(config.to_json(), OUTPUT / "config.json")
    io_functions.save_json(validseq.dataset.labels, OUTPUT / "ids_validate.json")
    io_functions.save_json(trainseq.dataset.labels, OUTPUT / "ids_train.json")