示例#1
0
def makeplots(tc, path, savepath, remove_structural: bool, nfolds,
              binarize_list, softmax_list, models_list, priors_list, **kwargs):
    """
    Load the per-fold results pickled under ``path`` and write plots to ``savepath``.

    For every fold the pickled LRs are read, and for every target class the
    pickled accuracy / Cllr / coefficient arrays are read into arrays whose
    first axis is the fold number. ROC and histogram plots are produced for the
    'test augm' and 'mixt' data; a scatter plot is produced when more than one
    prior is given; box plots are produced when more than one fold is given.

    Relies on the module-level names ``single_cell_types``, ``marker_names``
    and ``DEBUG``.

    :param tc: target class descriptions, converted with ``string2vec``
    :param path: directory containing the pickled per-fold results
    :param savepath: directory in which the figures are saved
    :param remove_structural: forwarded to ``get_data_per_cell_type``
    :param nfolds: number of folds whose results are loaded
    :param binarize_list: binarize settings; its length sizes the metric arrays
    :param softmax_list: softmax settings; its length sizes the metric arrays
    :param models_list: model settings; its length sizes the metric arrays
    :param priors_list: priors; its length sizes the metric arrays
    :param kwargs: accepted for call compatibility; not used
    """

    def _unpickle(filename):
        # The context manager closes the handle even when unpickling fails.
        # NOTE(review): pickle can execute arbitrary code while loading; only
        # point `path` at results produced by a trusted run.
        with open(os.path.join(path, filename), 'rb') as handle:
            return pickle.load(handle)

    _, _, _, _, _, label_encoder, _, _ = get_data_per_cell_type(
        single_cell_types=single_cell_types,
        remove_structural=remove_structural)
    target_classes = string2vec(tc, label_encoder)

    # (display name, file-system friendly name) for each target class
    sanitize = str.maketrans({' ': '_', '.': '_', '/': '_'})
    class_names = []
    for target_class in target_classes:
        target_class_str = vec2string(target_class, label_encoder)
        class_names.append(
            (target_class_str, target_class_str.translate(sanitize)))

    metric_shape = (nfolds, len(binarize_list), len(softmax_list),
                    len(models_list), len(priors_list))
    coeffs_shape = (nfolds, len(binarize_list), 1, len(marker_names) - 4 + 1,
                    len(priors_list))

    # File name prefixes of the pickled metrics, in the order they are loaded.
    metric_prefixes = ('accuracies_train', 'accuracies_test',
                       'accuracies_test_as_mixt', 'accuracies_mixt',
                       'accuracies_single', 'cllr_test', 'cllr_test_as_mixt',
                       'cllr_mixt')
    metrics = {
        prefix: {name: np.zeros(metric_shape) for name, _ in class_names}
        for prefix in metric_prefixes
    }
    coeffs = {name: np.zeros(coeffs_shape) for name, _ in class_names}

    lrs_for_model_per_fold = OrderedDict()
    for n in range(nfolds):
        lrs_for_model_per_fold[str(n)] = _unpickle(
            'lrs_for_model_in_fold_{}'.format(n))

        for name, save_name in class_names:
            for prefix in metric_prefixes:
                metrics[prefix][name][n, :, :, :, :] = _unpickle(
                    '{}_{}_{}'.format(prefix, save_name, n))
            coeffs[name][n, :, :, :, :] = _unpickle(
                'coeffs_{}_{}'.format(save_name, n))

    accuracies_train = metrics['accuracies_train']
    accuracies_test = metrics['accuracies_test']
    cllr_test = metrics['cllr_test']
    cllr_test_as_mixtures = metrics['cllr_test_as_mixt']
    cllr_mixtures = metrics['cllr_mixt']

    types_data = ['test augm', 'mixt']

    for type_data in types_data:
        _, lrs_after_for_all_methods, y_nhot_for_all_methods = \
            append_lrs_for_all_folds(lrs_for_model_per_fold, type=type_data)

        for kind in ['roc', 'histogram']:
            plot_property_all_lrs_all_folds(lrs_after_for_all_methods,
                                            y_nhot_for_all_methods,
                                            target_classes,
                                            label_encoder,
                                            kind=kind,
                                            savefig=os.path.join(
                                                savepath,
                                                f'{kind}_{type_data}'))

    _, lrs_after_for_all_methods, y_nhot_for_all_methods = \
        append_lrs_for_all_folds(lrs_for_model_per_fold, type='test augm')
    if len(priors_list) > 1:
        # NOTE(review): the file name suffix has always been the last entry of
        # `types_data` ('mixt') even though the LRs plotted here are the
        # 'test augm' ones. Kept as is so existing output names do not change;
        # confirm which label is intended.
        scatter_suffix = types_data[-1]
        plot_scatterplots_all_lrs_different_priors(
            lrs_after_for_all_methods,
            y_nhot_for_all_methods,
            target_classes,
            label_encoder,
            savefig=os.path.join(
                savepath,
                'LRs_for_different_priors_{}'.format(scatter_suffix)))

    if nfolds > 1:
        for name, save_name in class_names:

            def _plot(plot_function, values, ylabel, stem):
                # All metric plots share the same settings and naming scheme.
                plot_function(binarize_list,
                              softmax_list,
                              models_list,
                              priors_list,
                              values,
                              label_encoder,
                              ylabel,
                              savefig=os.path.join(savepath,
                                                   stem.format(save_name)))

            _plot(plot_boxplot_of_metric, cllr_test[name], "$C_{llr}$",
                  'boxplot_cllr_test_{}')
            _plot(plot_boxplot_of_metric, cllr_mixtures[name], "$C_{llr}$",
                  'boxplot_cllr_mixtures_{}')

            if DEBUG:
                _plot(plot_boxplot_of_metric, accuracies_train[name],
                      'accuracy', 'boxplot_accuracy_train_{}')
                _plot(plot_boxplot_of_metric, accuracies_test[name],
                      "accuracy", 'boxplot_accuracy_test_{}')
                _plot(plot_boxplot_of_metric, cllr_test_as_mixtures[name],
                      "$C_{llr}$", 'boxplot_cllr_test_as_mixt_{}')
                _plot(plot_progress_of_metric, accuracies_train[name],
                      'accuracy', 'progress_accuracy_train_{}')
                _plot(plot_progress_of_metric, accuracies_test[name],
                      'accuracy', 'progress_accuracy_test_{}')
                _plot(plot_progress_of_metric, cllr_test[name], '$C_{llr}$',
                      'progress_cllr_test_{}')
                # The coefficients are plotted per marker: the marker names
                # (plus the intercept) take the place of the model list.
                plot_boxplot_of_metric(
                    binarize_list, [False],
                    [[a, True] for a in ['intercept'] + marker_names],
                    priors_list,
                    coeffs[name],
                    label_encoder,
                    "log LR",
                    savefig=os.path.join(
                        savepath,
                        'boxplot_coefficients_{}'.format(save_name)),
                    ylim=[-3, 3])
示例#2
0
def get_final_trained_mlr_model(
        tc,
        single_cell_types,
        retrain,
        n_samples_per_combination,
        binarize=True,
        from_penile=False,
        prior=(1, 1, 1, 1, 1, 1, 1, 1),
        model_name='best_MLR',
        remove_structural=True,
        save_path=None,
        alternative_hypothesis=None,
        # blood, nasal, vaginal
        samples_to_evaluate=np.array([[1] * 3 + [0] + [1] * 5 + [0] * 6]),
        use_mixtures=True):
    """
    Computes or loads the MLR model based on all data and writes its outputs.

    When ``retrain`` is true, a calibrated 'MLR' model is fitted on augmented
    single cell type data and pickled to ``save_path/model_name``; otherwise
    the model is unpickled from that location. Afterwards the model is
    compared to the multiclass approach, its coefficients are plotted, and one
    ';'-separated CSV file with the coefficients is written per target class.

    :param tc: target class descriptions, converted with ``string2vec``
    :param single_cell_types: single cell types, forwarded to the data loader
    :param retrain: if true fit and save a new model, else load the saved one
    :param n_samples_per_combination: number of augmented samples per
        combination; used for each of the three augmented sets
    :param binarize: forwarded to the mixture reader and the augmentation
    :param from_penile: forwarded to the augmentation
    :param prior: prior used for augmentation; also part of the plot file name
    :param model_name: file name of the pickled model
    :param remove_structural: forwarded to ``read_mixture_data``
    :param save_path: directory for all outputs; must be an existing directory
        (the default ``None`` makes ``os.path.join`` raise a TypeError)
    :param alternative_hypothesis: if given, a second model is trained with
        restricted mixtures and its LRs are plotted against the main model's
    :param samples_to_evaluate: samples for which LRs are computed
    :param use_mixtures: if true (and retraining), the mixture data is read
        and saved as a table
    """

    def _labels_per_target_class(y_nhot):
        # For every target class: 1 if any of its cell types is present.
        indices = [
            np.argwhere(target_classes[i, :] == 1).flatten().tolist()
            for i in range(target_classes.shape[0])
        ]
        return np.array([
            np.max(np.array(y_nhot[:, columns]), axis=1)
            for columns in indices
        ]).T

    softmax = False
    mle = MultiLabelEncoder(len(single_cell_types))
    model_path = os.path.join(save_path, model_name)

    # NOTE(review): remove_structural is fixed to True here while the mixture
    # reader below receives the `remove_structural` argument. Left unchanged
    # because the default `samples_to_evaluate` appears to assume it; confirm
    # whether the argument should be passed here as well.
    X_single, y_nhot_single, n_celltypes, n_features, n_per_celltype, label_encoder, present_markers, present_celltypes = \
        get_data_per_cell_type(single_cell_types=single_cell_types, remove_structural=True)

    y_single = mle.transform_single(mle.nhot_to_labels(y_nhot_single))
    target_classes = string2vec(tc, label_encoder)

    save_data_table(X_single,
                    [vec2string(y, label_encoder) for y in y_nhot_single],
                    present_markers,
                    os.path.join(save_path, 'single cell data.csv'))

    if retrain:
        model = clf_with_correct_settings('MLR',
                                          softmax=softmax,
                                          n_classes=-1,
                                          with_calibration=True)
        X_train, X_calib, y_train, y_calib = train_test_split(
            X_single, y_single, stratify=y_single, test_size=0.5)
        if use_mixtures:
            X_mixtures, y_nhot_mixtures, _ = read_mixture_data(
                n_celltypes,
                label_encoder,
                binarize=binarize,
                remove_structural=remove_structural)

            save_data_table(X_mixtures, [
                vec2string(y, label_encoder).replace(' and/or ', '+')
                for y in y_nhot_mixtures
            ], present_markers, os.path.join(save_path, 'mixture data.csv'))
        augmented_data = augment_splitted_data(X_train,
                                               y_train,
                                               X_calib,
                                               y_calib,
                                               None,
                                               None,
                                               None,
                                               n_celltypes,
                                               n_features,
                                               label_encoder,
                                               prior, [binarize],
                                               from_penile,
                                               [n_samples_per_combination] * 3,
                                               disallowed_mixtures=None)

        y_train = _labels_per_target_class(
            augmented_data.y_train_nhot_augmented)

        model.fit_classifier(augmented_data.X_train_augmented, y_train)
        model.fit_calibration(augmented_data.X_calib_augmented,
                              augmented_data.y_calib_nhot_augmented,
                              target_classes)
        with open(model_path, 'wb') as model_file:
            pickle.dump(model, model_file)
    else:
        # NOTE(review): pickle can execute arbitrary code while loading; only
        # load models from a trusted `save_path`.
        with open(model_path, 'rb') as model_file:
            model = pickle.load(model_file)

    if alternative_hypothesis:
        # also plot LRs of our hypothesis pairs against LRs when H2 is more specific
        implied_target = string2vec(
            ['Vaginal.mucosa and/or Menstrual.secretion'], label_encoder)
        alternative_target = string2vec(alternative_hypothesis, label_encoder)
        # at least one of H1 or alternative should be present, disallow absence of all:
        # (the `np.int` alias was removed in NumPy 1.24; the builtin is equivalent)
        disallowed_mixtures = (-implied_target - alternative_target).astype(
            int)

        X_train, X_calib, y_train, y_calib = train_test_split(
            X_single, y_single, stratify=y_single, test_size=0.5)

        _, y_nhot_mixtures, _ = read_mixture_data(
            n_celltypes,
            label_encoder,
            binarize=binarize,
            remove_structural=remove_structural)

        augmented_data = augment_splitted_data(
            X_train,
            y_train,
            X_calib,
            y_calib,
            None,
            None,
            y_nhot_mixtures,
            n_celltypes,
            n_features,
            label_encoder,
            prior, [binarize],
            from_penile, [n_samples_per_combination] * 3,
            disallowed_mixtures=disallowed_mixtures)

        y_train = _labels_per_target_class(
            augmented_data.y_train_nhot_augmented)
        specific_model = clf_with_correct_settings('MLR',
                                                   softmax=False,
                                                   n_classes=-1,
                                                   with_calibration=True)
        specific_model.fit_classifier(augmented_data.X_train_augmented,
                                      y_train)
        specific_model.fit_calibration(augmented_data.X_calib_augmented,
                                       augmented_data.y_calib_nhot_augmented,
                                       target_classes)

        log_lrs = []
        specific_log_lrs = []
        for sample in samples_to_evaluate:
            log_lrs.append(
                np.log10(model.predict_lrs([sample], target_classes))[0][-1])
            specific_log_lrs.append(
                np.log10(specific_model.predict_lrs([sample],
                                                    target_classes))[0][-1])
        plot_multiclass_comparison(specific_log_lrs,
                                   log_lrs, [
                                       'blood+nas+vag', 'menstr',
                                       'indication of menstr', 'blood', 's***n'
                                   ],
                                   'specific_hypothesis',
                                   save_path,
                                   x_label='log(LR)',
                                   y_label='log(LR) H2: blood')

    compare_to_multiclass(X_single,
                          y_single,
                          target_classes,
                          tc,
                          model,
                          samples_to_evaluate,
                          save_path=save_path,
                          alternative_target=None)

    # plot the coefficients
    plot_coefficient_importances(model,
                                 target_classes,
                                 present_markers,
                                 label_encoder,
                                 savefig=os.path.join(
                                     save_path,
                                     'coefs_{}_{}'.format(prior, model_name)),
                                 show=None)

    # Build the header once and leave `present_markers` untouched: inserting
    # 'intercept' into it on every iteration added one extra 'intercept'
    # column to the header of each subsequent file.
    header = ['intercept'] + list(present_markers)

    for t in range(len(target_classes)):
        intercept, coefficients = model.get_coefficients(
            t, target_classes[t].squeeze())
        all_coefficients = np.append(intercept, coefficients).tolist()
        # decimal comma instead of decimal point
        all_coefficients_str = [
            str(coef).replace('.', ',') for coef in all_coefficients
        ]

        csv_path = os.path.join(
            save_path, 'coefs_{}_{}.csv'.format(tc[t].replace('/', '_'),
                                                model_name))
        # newline='' lets the csv module control line endings itself
        with open(csv_path, mode='w', newline='') as coefs:
            coefs_writer = csv.writer(coefs,
                                      delimiter=';',
                                      quotechar='"',
                                      quoting=csv.QUOTE_MINIMAL)
            coefs_writer.writerow(header)
            coefs_writer.writerow(all_coefficients_str)
示例#3
0
def test_augment_data():
    """
    Tests that, for the given priors, the number of times a cell type
    occurs in the augmented data matches what the prior implies.

    With uniform priors every cell type must occur equally often. Otherwise
    exactly one cell type has a deviating prior, and the fraction of samples
    with that cell type must relate to the fraction without it as the prior
    prescribes.
    """

    from_penile = False
    mle = MultiLabelEncoder(len(single_cell_types))
    tc = ['Skin', 'Vaginal.mucosa and/or Menstrual.secretion']

    (X_single, y_nhot_single, n_celltypes, n_features, n_per_celltype,
     label_encoder, present_markers, present_celltypes) = get_data_per_cell_type(
         filename='../Datasets/Dataset_NFI_rv.xlsx',
         single_cell_types=single_cell_types,
         remove_structural=True)
    y_single = mle.transform_single(mle.nhot_to_labels(y_nhot_single))
    target_classes = string2vec(tc, label_encoder)

    N_SAMPLES_PER_COMBINATION = [11, 22, 33]
    uniform = [1, 1, 1, 1, 1, 1, 1, 1]
    first_more_often = [10, 1, 1, 1, 1, 1, 1, 1]  # cell type 1 occurs 10 times more often
    first_less_often = [1, 10, 10, 10, 10, 10, 10, 10]  # cell type 1 occurs 10 times less often
    priors = [uniform, first_more_often, first_less_often]

    for N_SAMPLES in N_SAMPLES_PER_COMBINATION:
        print(N_SAMPLES)
        for prior in priors:
            print(prior)
            X_augmented, y_nhot = augment_data(X_single,
                                               y_single,
                                               n_celltypes,
                                               n_features,
                                               N_SAMPLES,
                                               label_encoder,
                                               prior,
                                               binarize=True,
                                               from_penile=from_penile)

            n_total = y_nhot.shape[0]
            occurrence_celltypes = np.sum(y_nhot, axis=0)

            is_uniform = prior is None or len(np.unique(prior)) == 1
            if is_uniform:
                # every cell type has to occur exactly as often as the first
                occurrences = occurrence_celltypes.tolist()
                assert all(count == occurrences[0] for count in occurrences)
                continue

            # Map "how often a prior value appears" -> "that prior value";
            # the value that appears once belongs to the deviating cell type.
            value_by_frequency = {}
            for value in set(prior):
                value_by_frequency[prior.count(value)] = value
            relevant_prior = value_by_frequency.pop(1)
            value_other_priors = list(value_by_frequency.values())[0]

            n_with = occurrence_celltypes[prior.index(relevant_prior)]
            fraction_with = float(n_with / n_total)
            fraction_without = float((n_total - n_with) / n_total)

            if relevant_prior == 1:
                assert round(fraction_with * value_other_priors, 5) == \
                       round(fraction_without, 5)
            else:
                assert round(fraction_with, 5) == \
                       round(fraction_without * relevant_prior, 5)
示例#4
0
def nfold_analysis(nfolds, tc, savepath, from_penile: bool, models_list,
                   softmax_list: List[bool], priors_list: List[List],
                   binarize_list: List[bool], test_size: float,
                   calibration_size: float, remove_structural: bool,
                   calibration_on_loglrs: bool, nsamples: Tuple[int, int,
                                                                int]):

    mle = MultiLabelEncoder(len(single_cell_types))
    baseline_prior = str(priors_list[0])

    # ======= Load data =======
    X_single, y_nhot_single, n_celltypes, n_features, n_per_celltype, label_encoder, present_markers, present_celltypes = \
        get_data_per_cell_type(single_cell_types=single_cell_types, remove_structural=remove_structural)
    y_single = mle.transform_single(mle.nhot_to_labels(y_nhot_single))
    target_classes = string2vec(tc, label_encoder)

    outer = tqdm(total=nfolds,
                 desc='{} folds'.format(nfolds),
                 position=0,
                 leave=False)
    for n in range(nfolds):
        # n = n + (nfolds * run)
        print(n)

        # ======= Initialize =======
        lrs_for_model_in_fold = OrderedDict()
        emtpy_numpy_array = np.zeros((len(binarize_list), len(softmax_list),
                                      len(models_list), len(priors_list)))
        accuracies_train_n, accuracies_test_n, accuracies_test_as_mixtures_n, accuracies_mixtures_n, accuracies_single_n,\
        cllr_test_n, cllr_test_as_mixtures_n, cllr_mixtures_n, coeffs = [dict() for i in range(9)]

        for target_class in target_classes:
            target_class_str = vec2string(target_class, label_encoder)

            accuracies_train_n[target_class_str] = emtpy_numpy_array.copy()
            accuracies_test_n[target_class_str] = emtpy_numpy_array.copy()
            accuracies_test_as_mixtures_n[
                target_class_str] = emtpy_numpy_array.copy()
            accuracies_mixtures_n[target_class_str] = emtpy_numpy_array.copy()
            accuracies_single_n[target_class_str] = emtpy_numpy_array.copy()

            cllr_test_n[target_class_str] = emtpy_numpy_array.copy()
            cllr_test_as_mixtures_n[target_class_str] = emtpy_numpy_array.copy(
            )
            cllr_mixtures_n[target_class_str] = emtpy_numpy_array.copy()
            coeffs[target_class_str] = np.zeros(
                (len(binarize_list), 1, X_single[0].shape[1] + 1,
                 len(priors_list)))
        # ======= Split data =======
        X_train, X_test, y_train, y_test = train_test_split(
            X_single, y_single, stratify=y_single, test_size=test_size)
        X_train, X_calib, y_train, y_calib = train_test_split(
            X_train, y_train, stratify=y_train, test_size=calibration_size)

        for i, binarize in enumerate(binarize_list):
            X_mixtures, y_nhot_mixtures, mixture_label_encoder = read_mixture_data(
                n_celltypes,
                label_encoder,
                binarize=binarize,
                remove_structural=remove_structural)

            # ======= Augment data for all priors =======
            augmented_data = OrderedDict()
            for p, priors in enumerate(priors_list):
                augmented_data[str(priors)] = augment_splitted_data(
                    X_train,
                    y_train,
                    X_calib,
                    y_calib,
                    X_test,
                    y_test,
                    y_nhot_mixtures,
                    n_celltypes,
                    n_features,
                    label_encoder,
                    priors,
                    binarize_list,
                    from_penile,
                    nsamples,
                    disallowed_mixtures=None)

            # ======= Transform data accordingly =======
            if binarize:
                X_test_transformed = [[
                    np.where(X_test[i][j] > 150, 1, 0)
                    for j in range(len(X_test[i]))
                ] for i in range(len(X_test))]
                X_test_transformed = combine_samples(X_test_transformed)
            else:
                X_test_transformed = combine_samples(X_test) / 1000

            for j, softmax in enumerate(softmax_list):
                for k, model_calib in enumerate(models_list):
                    print(model_calib[0])

                    # ======= Calculate LRs before and after calibration =======
                    key_name = bool2str_binarize(
                        binarize) + '_' + bool2str_softmax(
                            softmax) + '_' + str(model_calib)
                    if not model_calib[1]:
                        key_name += '_uncal'
                    key_name_per_fold = str(n) + '_' + key_name
                    model, lrs_before_calib, lrs_after_calib, y_test_nhot_augmented, \
                    lrs_before_calib_test_as_mixtures, lrs_after_calib_test_as_mixtures, y_test_as_mixtures_nhot_augmented, \
                    lrs_before_calib_mixt, lrs_after_calib_mixt = \
                        calculate_lrs_for_different_priors(augmented_data, X_mixtures, target_classes, baseline_prior,
                                                           present_markers, model_calib, mle, label_encoder, key_name_per_fold,
                                                           softmax, calibration_on_loglrs, savepath)

                    lrs_for_model_in_fold[key_name] = LrsBeforeAfterCalib(
                        lrs_before_calib, lrs_after_calib,
                        y_test_nhot_augmented,
                        lrs_before_calib_test_as_mixtures,
                        lrs_after_calib_test_as_mixtures,
                        y_test_as_mixtures_nhot_augmented,
                        lrs_before_calib_mixt, lrs_after_calib_mixt,
                        y_nhot_mixtures)

                    ## Check which samples the method makes an error with
                    # indices_values_above_one = np.argwhere(lrs_for_model_in_fold['[1, 1, 1, 1, 1, 1, 1, 1]'].lrs_before_calib > 1)[:, 0]
                    # indices_values_below_one = np.argwhere(lrs_for_model_in_fold['[1, 1, 1, 1, 1, 1, 1, 1]'].lrs_before_calib < 1)[:, 0]
                    # labels = np.max(np.multiply(augmented_data['[1, 1, 1, 1, 1, 1, 1, 1]'].y_test_nhot_augmented, target_class), axis=1)
                    # indices_fp = np.argwhere(labels[indices_values_above_one] == 0)
                    # indices_fn = np.argwhere(labels[indices_values_below_one] == 1)
                    # augmented_data['[1, 1, 1, 1, 1, 1, 1, 1]'].y_test_nhot_augmented[indices_values_above_one][indices_fp][:, 0, :]
                    # augmented_data['[1, 1, 1, 1, 1, 1, 1, 1]'].y_test_nhot_augmented[indices_values_below_one][indices_fn][:, 0, :]

                    # ======= Calculate performance metrics =======
                    for t, target_class in enumerate(target_classes):
                        for p, priors in enumerate(priors_list):
                            str_prior = str(priors)
                            target_class_str = vec2string(
                                target_class, label_encoder)

                            accuracies_train_n[target_class_str][
                                i, j, k,
                                p] = calculate_accuracy_all_target_classes(
                                    augmented_data[str_prior].
                                    X_train_augmented,
                                    augmented_data[str_prior].
                                    y_train_nhot_augmented, target_classes,
                                    model[str_prior], mle)[t]
                            accuracies_test_n[target_class_str][
                                i, j, k,
                                p] = calculate_accuracy_all_target_classes(
                                    augmented_data[baseline_prior].
                                    X_test_augmented,
                                    augmented_data[baseline_prior].
                                    y_test_nhot_augmented, target_classes,
                                    model[str_prior], mle)[t]
                            accuracies_test_as_mixtures_n[target_class_str][
                                i, j, k,
                                p] = calculate_accuracy_all_target_classes(
                                    augmented_data[baseline_prior].
                                    X_test_as_mixtures_augmented,
                                    augmented_data[baseline_prior].
                                    y_test_as_mixtures_nhot_augmented,
                                    target_classes, model[str_prior], mle)[t]
                            accuracies_mixtures_n[target_class_str][
                                i, j, k,
                                p] = calculate_accuracy_all_target_classes(
                                    X_mixtures, y_nhot_mixtures,
                                    target_classes, model[str_prior], mle)[t]
                            accuracies_single_n[target_class_str][
                                i, j, k,
                                p] = calculate_accuracy_all_target_classes(
                                    X_test_transformed,
                                    mle.inv_transform_single(y_test),
                                    target_classes, model[str_prior], mle)[t]

                            cllr_test_n[target_class_str][i, j, k, p] = cllr(
                                lrs_after_calib[str_prior][:, t],
                                augmented_data[baseline_prior].
                                y_test_nhot_augmented, target_class)
                            cllr_test_as_mixtures_n[target_class_str][
                                i, j, k, p] = cllr(
                                    lrs_after_calib_test_as_mixtures[str_prior]
                                    [:, t], augmented_data[baseline_prior].
                                    y_test_as_mixtures_nhot_augmented,
                                    target_class)
                            cllr_mixtures_n[target_class_str][
                                i, j, k, p] = cllr(
                                    lrs_after_calib_mixt[str_prior][:, t],
                                    y_nhot_mixtures, target_class)
                            if model_calib[0] == 'MLR' and not softmax:
                                # save coefficients
                                intercept, coefficients = model[str(
                                    priors)].get_coefficients(t, target_class)
                                coeffs[target_class_str][i, 0, 0,
                                                         p] = intercept
                                for i_coef, coef in enumerate(coefficients):
                                    coeffs[target_class_str][i, 0, i_coef + 1,
                                                             p] = coef

        outer.update(1)

        # ======= Save lrs and performance metrics =======
        pickle.dump(
            lrs_for_model_in_fold,
            open(
                os.path.join(savepath,
                             'picklesaves/lrs_for_model_in_fold_{}'.format(n)),
                'wb'))

        for t, target_class in enumerate(target_classes):
            target_class_str = vec2string(target_class, label_encoder)
            target_class_save = target_class_str.replace(" ", "_")
            target_class_save = target_class_save.replace(".", "_")
            target_class_save = target_class_save.replace("/", "_")

            pickle.dump(
                accuracies_train_n[target_class_str],
                open(
                    os.path.join(
                        savepath, 'picklesaves/accuracies_train_{}_{}'.format(
                            target_class_save, n)), 'wb'))
            pickle.dump(
                accuracies_test_n[target_class_str],
                open(
                    os.path.join(
                        savepath, 'picklesaves/accuracies_test_{}_{}'.format(
                            target_class_save, n)), 'wb'))
            pickle.dump(
                accuracies_test_as_mixtures_n[target_class_str],
                open(
                    os.path.join(
                        savepath,
                        'picklesaves/accuracies_test_as_mixt_{}_{}'.format(
                            target_class_save, n)), 'wb'))
            pickle.dump(
                accuracies_mixtures_n[target_class_str],
                open(
                    os.path.join(
                        savepath, 'picklesaves/accuracies_mixt_{}_{}'.format(
                            target_class_save, n)), 'wb'))
            pickle.dump(
                accuracies_single_n[target_class_str],
                open(
                    os.path.join(
                        savepath, 'picklesaves/accuracies_single_{}_{}'.format(
                            target_class_save, n)), 'wb'))

            pickle.dump(
                cllr_test_n[target_class_str],
                open(
                    os.path.join(
                        savepath, 'picklesaves/cllr_test_{}_{}'.format(
                            target_class_save, n)), 'wb'))
            pickle.dump(
                cllr_test_as_mixtures_n[target_class_str],
                open(
                    os.path.join(
                        savepath, 'picklesaves/cllr_test_as_mixt_{}_{}'.format(
                            target_class_save, n)), 'wb'))
            pickle.dump(
                cllr_mixtures_n[target_class_str],
                open(
                    os.path.join(
                        savepath, 'picklesaves/cllr_mixt_{}_{}'.format(
                            target_class_save, n)), 'wb'))

            pickle.dump(
                coeffs[target_class_str],
                open(
                    os.path.join(
                        savepath, 'picklesaves/coeffs_{}_{}'.format(
                            target_class_save, n)), 'wb'))
    def analyse_data(self):
        """Load the input samples and the trained model, then print the LRs.

        Steps, in order:
          1. ``self.load_data()`` is unpacked into seven values; only the model
             filename, the marker names that were read and ``X_single`` are
             used here.
          2. ``X_single`` is passed through ``combine_samples`` and the shape
             and first row of the result are printed.
          3. If the marker names that were read differ from the module-level
             ``marker_names``, a warning is shown in a message box and also
             printed. Processing continues regardless.
          4. The pickled model is loaded and ``predict_lrs`` is called for
             every entry of ``single_cell_types`` plus the combined
             'Vaginal.mucosa and/or Menstrual.secretion' class, with priors
             obtained from ``self.top_variables`` (numerator) and
             ``self.bottom_variables`` (denominator).
          5. The resulting LRs are printed and ``mainloop()`` is called
             (presumably the tkinter event loop -- confirm against the
             file's imports).

        Returns:
            None. The LRs are only printed, not returned or stored.
        """
        # load_data() must yield exactly seven values; the ones that this
        # method does not need are bound to throwaway names.
        (model_filename, read_marker_names, _names, X_single,
         _n_celltypes_with_penile, _n_features,
         _n_per_celltype) = self.load_data()

        X = combine_samples(X_single)

        print('data loaded, shape {}. {}'.format(X.shape, X[0, :]))

        if read_marker_names != marker_names:
            # Build the warning once so the dialog and the console always
            # show the same text.
            warning_text = (
                "'The marker labels are inconsistent with the trained model, please fix the labels. "
                "The correct labels are: {}. Found {}".format(
                    marker_names, read_marker_names))
            messagebox.showinfo("Warning", warning_text)
            print(warning_text)

        # Load the trained model. The context manager guarantees the file
        # handle is closed even when unpickling fails.
        # NOTE(review): pickle can execute arbitrary code while loading; only
        # open model files that come from a trusted source.
        with open(model_filename, 'rb') as model_file:
            model = pickle.load(model_file)

        priors_numerator = get_prior(string2index, self.top_variables)
        priors_denominator = get_prior(string2index, self.bottom_variables)

        # for now, target classes are all separate classes and vag muc+menstr secr.
        target_classes_str = list(single_cell_types) + [
            'Vaginal.mucosa and/or Menstrual.secretion'
        ]
        lrs = model.predict_lrs(X,
                                string2vec(target_classes_str, string2index),
                                priors_numerator=priors_numerator,
                                priors_denominator=priors_denominator)
        print(lrs)

        # NOTE: the fully commented-out legacy code that used to follow here
        # (manual per-class probability aggregation, the Tk result table and
        # the CSV export) was dead code and has been dropped, together with
        # the locals that only it referenced.

        mainloop()