def train(data: utils.URLPath, output: utils.URLPath):
    """Train a new classifier using SOM data."""
    groups = GROUPS
    tubes = ("1", "2", "3")
    balance = {
        "CLL": 4000,
        "MBL": 2000,
        "MCL": 1000,
        "PL": 1000,
        "LPL": 1000,
        "MZL": 1000,
        "FL": 1000,
        "HCL": 1000,
        "normal": 6000,
    }
    mapping = None

    dataset = som_dataset.SOMDataset.from_path(data)
    train_dataset, validate_dataset = prepare_classifier_train_dataset(
        dataset, groups=groups, mapping=mapping, balance=balance)

    config = classifier.SOMClassifierConfig(**{
        "tubes": {tube: dataset.config[tube] for tube in tubes},
        "groups": groups,
        "pad_width": 2,
        "mapping": mapping,
        "cost_matrix": None,
    })
    model = train_som_classifier(train_dataset, validate_dataset, config)

    model.save(output)
    model.save_information(output)
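# A hypothetical invocation of train() above: the dataset and output paths are
# placeholders, and constructing utils.URLPath directly from a string is an
# assumption about the utils module, not something shown in this listing.
som_data = utils.URLPath("output/som-dataset")       # placeholder path
model_out = utils.URLPath("output/som-classifier")   # placeholder path
train(som_data, model_out)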
def main(args):
    dataset = som_dataset.SOMDataset.from_path(args.input)
    val = args.val
    train = args.train
    OUTPUT = args.output
    PANEL = args.panel
    bal = args.bal

    # set the groups according to the panel
    if PANEL == "MLL":
        groups = GROUPS
    elif PANEL == "ERLANGEN":
        groups = ["CLL", "MBL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]
    else:
        groups = ["CLL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]

    tubes = ("1",)  # single-element tuple; ("1") would be a plain string
    mapping = None
    balance = {key: bal for key in groups}

    config = classifier.SOMClassifierConfig(**{
        "tubes": {tube: dataset.config[tube] for tube in tubes},
        "groups": groups,
        "pad_width": 2,
        "mapping": mapping,
        "cost_matrix": None,
        "train_epochs": 20,
    })

    val = io_functions.load_json(val)
    validate_dataset = dataset.filter(labels=val)
    labels = io_functions.load_json(train)
    train_dataset = dataset.filter(labels=labels)

    train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
        train_dataset,
        split_ratio=0.9,
        groups=groups,
        mapping=mapping,
        balance=balance,
        val_dataset=validate_dataset)

    print(train_dataset.group_count)
    print(validate_dataset.group_count)

    model = fc_api.train_som_classifier(train_dataset, validate_dataset, config)

    model.save(OUTPUT)
    model.save_information(OUTPUT)
def run_kfold(
        *, output_path, base_model_path, som_dataset_path,
        k_number=5, panel="MLL", rerun=False, stratified=False):
    if not rerun and output_path.exists():
        LOGGER.info("Results already exist at %s, skipping", output_path)
        return
    args = locals()
    io_functions.save_json(args, output_path / "params.json")

    # set the groups according to the panel
    if panel == "MLL":
        groups = GROUPS
    elif panel == "ERLANGEN":
        groups = ["CLL", "MBL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]
    else:
        groups = ["CLL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]

    # tubes to be processed for merged samples
    tubes = ("1",)  # single-element tuple; ("1") would be a plain string
    mapping = {"groups": groups, "map": None}

    dataset = som_dataset.SOMDataset.from_path(som_dataset_path)
    LOGGER.info("Full dataset %s", dataset.group_count)

    splits = create_kfold_split(dataset, k_number=k_number, stratified=stratified)

    for n, (train_dataset, validate_dataset) in enumerate(splits):
        LOGGER.info("SPLIT n=%d", n)
        LOGGER.info("Train dataset %s", train_dataset.group_count)
        LOGGER.info("Validation dataset %s", validate_dataset.group_count)

        # change epochs to suit each dataset
        options = {
            "base_model_path": str(base_model_path / "model.h5"),
            "output_path": output_path / f"kfold_n{n}",
            "config": classifier.SOMClassifierConfig(**{
                "tubes": {tube: dataset.config[tube] for tube in tubes},
                "groups": groups,
                "pad_width": 2,
                "mapping": mapping,
                "cost_matrix": None,
                "train_epochs": 15,
            }),
        }
        run_transfer(options, train_dataset, validate_dataset)
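# create_kfold_split() is used above but not defined in this listing. The
# following is a minimal sketch of one way it could work, assuming each dataset
# entry exposes .label and .group attributes and that SOMDataset.filter(labels=...)
# returns the matching subset, as it does elsewhere in these scripts; it is not
# the project's actual implementation.
from sklearn.model_selection import KFold, StratifiedKFold

def sketch_kfold_split(dataset, k_number=5, stratified=False):
    labels = [case.label for case in dataset]   # assumed iteration protocol
    groups = [case.group for case in dataset]   # assumed per-sample group attribute
    splitter = (StratifiedKFold(n_splits=k_number, shuffle=True) if stratified
                else KFold(n_splits=k_number, shuffle=True))
    split_args = (labels, groups) if stratified else (labels,)
    for train_idx, val_idx in splitter.split(*split_args):
        train_labels = [labels[i] for i in train_idx]
        val_labels = [labels[i] for i in val_idx]
        yield dataset.filter(labels=train_labels), dataset.filter(labels=val_labels)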
groups = GROUPS
tubes = ("1",)  # single-element tuple; ("1") would be a plain string
# tubes = ("1", "2")
mapping = None

dataset = som_dataset.SOMDataset.from_path(SOM_DATASET)
train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
    dataset, split_ratio=0.3, groups=groups, mapping=mapping, balance=None)
labels_dict = train_dataset.group_count

config = classifier.SOMClassifierConfig(**{
    "tubes": {tube: dataset.config[tube] for tube in tubes},
    "groups": groups,
    "pad_width": 2,
    "mapping": mapping,
    "cost_matrix": None,
})

class_weight = create_class_weight(labels_dict)
# class_weight = utils.classification_utils.calculate_group_weights(labels_dict)
# map group names to the integer class indices expected by Keras
class_weight = {i: class_weight.get(g, 1.0) for i, g in enumerate(groups)}
print(class_weight)

model = fc_api.train_som_classifier(
    train_dataset, validate_dataset, config, class_weights=class_weight)
model.save(OUTPUT)
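# create_class_weight() is used above but not defined in this listing. A common
# inverse-frequency scheme with logarithmic smoothing is sketched below as an
# assumption about what it might do; the smoothing constant mu is also an assumption.
import math

def sketch_class_weight(labels_dict, mu=0.15):
    total = sum(labels_dict.values())
    weights = {}
    for group, count in labels_dict.items():
        score = math.log(mu * total / float(count))
        weights[group] = max(score, 1.0)   # never down-weight a group below 1.0
    return weights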
def main(args):
    dataset = som_dataset.SOMDataset.from_path(args.input)
    val = args.val
    train = args.train
    OUTPUT = args.output

    groups = ["MCL", "PL"]
    tubes = ("1",)  # single-element tuple; ("1") would be a plain string
    mapping = None
    balance = {
        "MCL": 20,
        "PL": 20,
    }

    config = classifier.SOMClassifierConfig(**{
        "tubes": {tube: dataset.config[tube] for tube in tubes},
        "groups": groups,
        "pad_width": 2,
        "mapping": mapping,
        "cost_matrix": None,
    })

    val = io_functions.load_json(val)
    validate_dataset = dataset.filter(labels=val)
    labels = io_functions.load_json(train)
    train_dataset = dataset.filter(labels=labels)

    train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
        train_dataset,
        split_ratio=0.9,
        groups=groups,
        mapping=mapping,
        balance=balance,
        val_dataset=validate_dataset)

    print(train_dataset.group_count)
    print(validate_dataset.group_count)

    # load a previously trained binarizer and model and continue training
    binarizer, model = load_model(args.model)

    trainseq = som_dataset.SOMSequence(
        train_dataset, binarizer, tube=config.tubes, pad_width=config.pad_width)
    validseq = som_dataset.SOMSequence(
        validate_dataset, binarizer, tube=config.tubes, pad_width=config.pad_width)

    model.fit_generator(generator=trainseq, epochs=10, validation_data=validseq)

    args.output.mkdir(parents=True, exist_ok=True)
    io_functions.save_joblib(binarizer, OUTPUT / "binarizer.joblib")
    model.save(str(args.output / "model.h5"))
    io_functions.save_json(config.to_json(), OUTPUT / "config.json")
    io_functions.save_json(validseq.dataset.labels, OUTPUT / "ids_validate.json")
    io_functions.save_json(trainseq.dataset.labels, OUTPUT / "ids_train.json")
def main(args):
    MLL5F = som_dataset.SOMDataset.from_path(args.input)
    OUTPUT = args.output
    # val_labels = args.val
    # train_labels = args.train
    # labels = args.labels

    LOGGER = utils.logs.setup_logging(None, "classify")

    groups = ["MCL", "PL"]
    tubes = ("1",)  # single-element tuple; ("1") would be a plain string
    mapping = None
    balance = {
        "MCL": 20,
        "PL": 20,
    }

    # vallabels = io_functions.load_json(val_labels)
    # validate_dataset = MLL5F.filter(labels=vallabels)
    # labels = io_functions.load_json(train_labels)
    # train_dataset = MLL5F.filter(labels=labels)
    # labels = io_functions.load_json(labels)
    # train_dataset = MLL5F.filter(labels=labels)

    train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
        MLL5F,
        split_ratio=0.90,
        groups=groups,
        mapping=mapping,
        balance=None)  # , val_dataset=validate_dataset)

    print(train_dataset.group_count)
    print(validate_dataset.group_count)

    config = classifier.SOMClassifierConfig(**{
        "tubes": {tube: MLL5F.config[tube] for tube in tubes},
        "groups": groups,
        "pad_width": 2,
        "mapping": mapping,
        "cost_matrix": None,
    })

    # binary classifier: single output unit trained with binary cross-entropy
    model = create_model(config.inputs, 1, global_decay=5e-3)
    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["acc"],
    )

    binarizer = LabelBinarizer()
    binarizer.fit(groups)

    trainseq = som_dataset.SOMSequence(
        train_dataset, binarizer, tube=config.tubes, pad_width=config.pad_width)
    validseq = som_dataset.SOMSequence(
        validate_dataset, binarizer, tube=config.tubes, pad_width=config.pad_width)

    model.fit_generator(
        generator=trainseq,
        validation_data=validseq,
        epochs=20,
        shuffle=True,
        class_weight=None)

    args.output.mkdir(parents=True, exist_ok=True)
    io_functions.save_joblib(binarizer, OUTPUT / "binarizer.joblib")
    model.save(str(args.output / "model.h5"))
    io_functions.save_json(config.to_json(), OUTPUT / "config.json")
    io_functions.save_json(validseq.dataset.labels, OUTPUT / "ids_validate.json")
    io_functions.save_json(trainseq.dataset.labels, OUTPUT / "ids_train.json")
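# Note on the binary setup above: with exactly two groups, LabelBinarizer emits a
# single 0/1 column rather than one-hot vectors, which matches the single-output
# model trained with binary cross-entropy. Quick standalone check:
from sklearn.preprocessing import LabelBinarizer

binarizer = LabelBinarizer()
binarizer.fit(["MCL", "PL"])
print(binarizer.transform(["MCL", "PL", "MCL"]))   # [[0], [1], [0]]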
def main(args):
    dataset = som_dataset.SOMDataset.from_path(args.input)
    val = args.val
    train = args.train
    OUTPUT = args.output
    PANEL = args.panel
    basemodel = args.basemodel
    bal = args.bal

    # set the groups according to the panel
    if PANEL == "MLL":
        groups = GROUPS
    elif PANEL == "ERLANGEN":
        groups = ["CLL", "MBL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]
    else:
        groups = ["CLL", "MCL", "LPL", "MZL", "FL", "HCL", "normal"]

    tubes = ("1",)  # single-element tuple; ("1") would be a plain string
    mapping = None
    balance = {key: bal for key in groups}

    config = classifier.SOMClassifierConfig(**{
        "tubes": {tube: dataset.config[tube] for tube in tubes},
        "groups": groups,
        "pad_width": 2,
        "mapping": mapping,
        "cost_matrix": None,
        "train_epochs": 15,
    })

    val = io_functions.load_json(val)
    validate_dataset = dataset.filter(labels=val)
    labels = io_functions.load_json(train)
    train_dataset = dataset.filter(labels=labels)

    train_dataset, validate_dataset = fc_api.prepare_classifier_train_dataset(
        train_dataset,
        split_ratio=0.9,
        groups=groups,
        mapping=mapping,
        balance=balance,
        val_dataset=validate_dataset)

    print(train_dataset.group_count)
    print(validate_dataset.group_count)

    # load the base model and reuse its weights
    base_model = models.load_model(str(basemodel / "model.h5"))
    weights = base_model.get_weights()

    # create a fresh model and initialize it with the base model weights
    model = create_model(config.inputs, config.output)
    model.set_weights(weights)

    # freeze two dense layers; which layers to freeze should be checked for each dataset
    model.get_layer('dense_1').trainable = False
    model.get_layer('dense_2').trainable = False

    model.compile(
        loss=config.get_loss(modeldir=None),
        optimizer="adam",
        metrics=["accuracy"])

    # wrap the compiled Keras model in a SOMClassifier
    model = SOMClassifier(config, model)

    train = model.create_sequence(train_dataset, config.train_batch_size)
    if validate_dataset is not None:
        validate = model.create_sequence(validate_dataset, config.valid_batch_size)
    else:
        validate = None

    model.train_generator(train, validate, epochs=config.train_epochs, class_weight=None)

    model.save(OUTPUT)
    model.save_information(OUTPUT)
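# The transfer step above freezes layers by name ('dense_1', 'dense_2'), which
# depends on how the base model was built. A small helper sketch using standard
# Keras attributes to list layer names and trainability before choosing what to freeze:
def sketch_print_layers(keras_model):
    for layer in keras_model.layers:
        print(layer.name, layer.trainable)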