Example #1
    def pr_curve(i):
        label = labels[i]
        statistics_l = Statistics()
        print('Doing label {}'.format(label))

        for train_idx, valid_idx in folds:
            rng = np.random.RandomState()
            rng.seed(seeds[i])
            training_fold = developement_df.loc[train_idx, ]
            training_fold = training_fold.reset_index(drop=True)
            validation_fold = developement_df.loc[valid_idx, ]
            validation_fold = validation_fold.reset_index(drop=True)
            base_estimators = make_classifiers(method, balanced, labels, random_state=rng)

            # Find the best params, then do a final proper calibration.
            base_estimator = base_estimators[label]
            estimator = RandomizedSearchCV(
                estimator=base_estimator, param_distributions=params,
                n_iter=60, scoring='f1', cv=3, random_state=rng,
                error_score=0.0, n_jobs=1, pre_dispatch='2*n_jobs',
                refit=True
            )

            # Set up the vectorizer for the bag-of-words representation
            if vectorizer_method == 'tf-idf':
                vectorizer = TfidfVectorizer(
                    stop_words=['go', '', ' '], binary=binary, lowercase=True,
                    sublinear_tf=False, max_df=1.0, min_df=0
                )
                vectorizer.fit(training_fold['terms'].values)
            elif vectorizer_method == 'count':
                vectorizer = CountVectorizer(
                    stop_words=['go', '', ' '], binary=binary, lowercase=True
                )
                vectorizer.fit(training_fold['terms'].values)

            # Fit and evaluate the performance of the classifier.
            x_train = vectorizer.transform(training_fold['terms'].values)
            y_train = np.asarray(training_fold[label].values, dtype=int)

            x_valid = vectorizer.transform(validation_fold['terms'].values)
            y_valid = np.asarray(validation_fold[label].values, dtype=int)

            estimator.fit(x_train, y_train)

            # Compute the validation probabilities once, then sweep the thresholds.
            y_valid_prob = estimator.predict_proba(x_valid)
            for t in thresholds:
                y_pred = [int(p[1] >= t) for p in y_valid_prob]
                precision = precision_score(y_valid, y_pred, labels=[0, 1], pos_label=1)
                recall = recall_score(y_valid, y_pred, labels=[0, 1], pos_label=1)
                f1 = f1_score(y_valid, y_pred, labels=[0, 1], pos_label=1)
                statistics_l.update_statistics(label=t, s_type='Precision', data=precision)
                statistics_l.update_statistics(label=t, s_type='Recall', data=recall)
                statistics_l.update_statistics(label=t, s_type='F1-Score', data=f1)

        statistics_l.frame()['reaction'] = label
        return statistics_l
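The loop above traces a precision-recall curve by sweeping thresholds manually. As a point of comparison, here is a minimal, self-contained sketch of the same idea using scikit-learn's built-in `precision_recall_curve`; the labels and scores below are invented for illustration.

    import numpy as np
    from sklearn.metrics import precision_recall_curve

    y_true = np.array([0, 1, 1, 0, 1])                # toy validation labels
    y_score = np.array([0.1, 0.8, 0.65, 0.4, 0.9])    # e.g. predict_proba(x_valid)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_true, y_score)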
Example #2
    def do_iteration(i):
        print("Iteration " + str(i+1))
        dev_df = train.copy(deep=True)
        test_df = test.copy(deep=True)
        rng = np.random.RandomState()
        rng.seed(seeds[i])
        folds_i = iterative_stratification(dev_df, labels, cv_folds, rng)

        # Create the appropriate statistics container for this iteration.
        validation_stats_i = Statistics()
        testing_stats_i = Statistics()

        for train_idx, valid_idx in cv_iterator(folds_i):
            training_fold = dev_df.loc[train_idx, ]
            validation_fold = dev_df.loc[valid_idx, ]

            # shuffle the folds
            training_fold = training_fold.reindex(rng.permutation(training_fold.index))
            validation_fold = validation_fold.reindex(rng.permutation(validation_fold.index))

            stats_valid, stats_test = multi_label_crf(
                labels=labels,
                df_train=training_fold,
                df_valid=validation_fold,
                df_test=test_df,
                binary=binary,
                connectivity='full',
                vectorizer_method=vectorizer_method
            )

            validation_stats_i.merge(stats_valid)
            testing_stats_i.merge(stats_test)

        log.write('Iteration {}\n'.format(i))
        validation_stats_i.write(log, 'a')
        testing_stats_i.write(log, 'a')
        return validation_stats_i, testing_stats_i
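The `reindex(rng.permutation(...))` idiom used above to shuffle each fold is easy to miss. A tiny self-contained illustration (the DataFrame and seed are made up):

    import numpy as np
    import pandas as pd

    rng = np.random.RandomState(0)                     # seed chosen arbitrarily
    df = pd.DataFrame({'terms': ['a', 'b', 'c', 'd'], 'label': [0, 1, 0, 1]})
    shuffled = df.reindex(rng.permutation(df.index))   # rows reordered, index values preserved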
Example #3
        def do_fold(j):
            print("\tFold " + str(j+1))
            train_idx = folds_i[j][0]
            valid_idx = folds_i[j][1]
            training_fold = developement_df.loc[train_idx, ]
            training_fold = training_fold.reset_index(drop=True)
            validation_fold = developement_df.loc[valid_idx, ]
            validation_fold = validation_fold.reset_index(drop=True)

            # Create the statistics containers for this fold.
            training_stats_i_f = Statistics()
            validation_stats_i_f = Statistics()
            testing_stats_i_f = Statistics()

            # Init the label ranking lists.
            label_pred_proba_train = []
            label_pred_proba_valid = []
            label_pred_proba_test = []

            label_y_train = []
            label_y_valid = []
            label_y_test = []

            # Set up the vectorizer for the bag-of-words representation
            if vectorizer_method == 'tf-idf':
                vectorizer = TfidfVectorizer(
                    stop_words=['go', '', ' '], binary=binary, lowercase=True,
                    sublinear_tf=True, max_df=1.0, min_df=0
                )
                vectorizer.fit(training_fold['terms'].values)
                alpha = None
                percentile = 100
            elif vectorizer_method == 'count':
                vectorizer = CountVectorizer(
                    stop_words=['go', '', ' '], binary=binary, lowercase=True
                )
                vectorizer.fit(training_fold['terms'].values)
                alpha = None
                percentile = 100
            else:
                raise ValueError("Unrecognised vectorizer_method: {}.".format(vectorizer_method))

            selectors = generate_selectors(selection, vectorizer.get_feature_names(), dag)
            base_estimators = make_classifiers(method, balanced, labels, selectors, selection, rng)
            for label in sorted(labels):
                print("\t\tFitting for label {}...".format(label))

                # SVMs assume standardised features, so we scale the features without centering
                # (avoiding the mean preserves the sparsity structure of the counts). Scaling
                # may also help with linear model convergence speed.
                x_train_l = vectorizer.transform(training_fold['terms'].values)
                y_train_l = np.asarray(training_fold[label].values, dtype=int)

                x_valid_l = vectorizer.transform(validation_fold['terms'].values)
                y_valid_l = np.asarray(validation_fold[label].values, dtype=int)

                x_test_l = vectorizer.transform(testing_df['terms'].values)
                y_test_l = np.asarray(test_df_i[label].values, dtype=int)

                if scale:
                    x_train_l = mean_center(x_train_l, with_mean=False)
                    x_valid_l = mean_center(x_valid_l, with_mean=False)
                    x_test_l = mean_center(x_test_l, with_mean=False)

                # We generate the folds for randomised search up-front. We hold out one of the
                # folds for probability calibration so each sampled param set gets calibrated on
                # the same data. This leaves cv_folds - 2 folds for randomised-search cross-validation.
                # cv_rand = StratifiedKFold(n_splits=3, shuffle=True, random_state=rng)
                base_estimator_l = base_estimators[label]
                fresh_estimator = clone(base_estimator_l)

                # Find the best params, then do a final proper calibration.
                params = sk_generate_params(method, selection)
                estimator_l = RandomizedSearchCV(
                    estimator=base_estimator_l, param_distributions=params,
                    n_iter=60, scoring='f1', cv=3, random_state=rng,
                    error_score=0.0, n_jobs=1, pre_dispatch='2*n_jobs',
                    refit=True
                )

                # Test if there's any signal if we permute the labels.
                # Classifier should do poorly if we do so.
                if permute:
                    y_train_l = rng.permutation(y_train_l)

                threshold = 0.5
                estimator_l.fit(x_train_l, y_train_l)
                best_params_l = estimator_l.best_params_

                # Calibrate the tuned estimator with the best hyperparameters
                # (skipped for logistic regression, which already outputs probabilities).
                if method not in ['lr']:
                    estimator_l = CalibratedClassifierCV(fresh_estimator.set_params(**best_params_l),
                                                         cv=3, method='sigmoid')
                    estimator_l.fit(x_train_l, y_train_l)

                # Evaluate performance characteristics; also score on the training data to check for overfitting.
                y_train_prob_l = estimator_l.predict_proba(x_train_l)
                y_valid_prob_l = estimator_l.predict_proba(x_valid_l)
                y_test_prob_l = estimator_l.predict_proba(x_test_l)
                training_stats_i_f.merge(evaluate_model(y_train_l, y_train_prob_l, label, threshold))
                validation_stats_i_f.merge(evaluate_model(y_valid_l, y_valid_prob_l, label, threshold))

                # Compute independent test data performance
                testing_stats_i_f.merge(evaluate_model(y_test_l, y_test_prob_l, label, threshold))

                # Get label ranking info
                label_pred_proba_train.append([p[1] for p in y_train_prob_l])
                label_pred_proba_valid.append([p[1] for p in y_valid_prob_l])
                label_pred_proba_test.append([p[1] for p in y_test_prob_l])

                label_y_train.append(y_train_l)
                label_y_valid.append(y_valid_l)
                label_y_test.append(y_test_l)

                print(validation_stats_i_f.frame())

            # Compute multi-label performance statistics
            y = np.column_stack(label_y_train)
            y_prob = np.column_stack(label_pred_proba_train)
            training_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

            y = np.column_stack(label_y_valid)
            y_prob = np.column_stack(label_pred_proba_valid)
            validation_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

            y = np.column_stack(label_y_test)
            y_prob = np.column_stack(label_pred_proba_test)
            testing_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

            return training_stats_i_f, validation_stats_i_f, testing_stats_i_f
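Example #3 tunes each per-label classifier with `RandomizedSearchCV` and then calibrates a fresh clone with the best parameters. Below is a compact, self-contained sketch of that search-then-calibrate pattern on synthetic data; the `LinearSVC` base estimator and the `C` distribution are stand-ins of my own choosing, not the snippet's actual `make_classifiers` output.

    from scipy.stats import uniform
    from sklearn.base import clone
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.datasets import make_classification
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.svm import LinearSVC

    X, y = make_classification(n_samples=200, random_state=0)
    base = LinearSVC()

    # Randomised search over a simple, illustrative parameter distribution.
    search = RandomizedSearchCV(base, {'C': uniform(0.01, 10)}, n_iter=10,
                                scoring='f1', cv=3, random_state=0)
    search.fit(X, y)

    # Refit a fresh clone with the best parameters and calibrate its scores.
    calibrated = CalibratedClassifierCV(clone(base).set_params(**search.best_params_),
                                        cv=3, method='sigmoid')
    calibrated.fit(X, y)
    probabilities = calibrated.predict_proba(X)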
Example #4
    labels = get_labels_from_file('data/labels.tsv')

    n = len(labels)
    split_train = {l:0 for l in labels}
    for l in labels:
        split_train[l] = sum(developement_df[l].values)

    split_test = {l:0 for l in labels}
    for l in labels:
        split_test[l] = sum(testing_df[l].values)

    n_samples_train = len(developement_df)
    n_samples_test = len(testing_df)

    # Create the appropriate statistics container for the whole experiment.
    training_stats = Statistics()
    validation_stats = Statistics()
    testing_stats = Statistics()
    seeds = create_seeds(iterations)
    min_class_freq = min(split_train.values())
    cv_folds = min([min_class_freq, cv_folds])
    statistics_objects = []
    best_params = {l: {'score': 0.0, 'params': {}} for l in labels}

    print("Running Supervised Ensemble Classification...")
    # def do_iteration(i):
    for i in range(iterations):
        print("Iteration " + str(i+1))
        rng = np.random.RandomState()
        rng.seed(seeds[i])
        dev_df_i = developement_df.copy(deep=True)
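`create_seeds` is used in several of these snippets (including the one above) but its body is never shown. A plausible, purely hypothetical sketch would simply draw one integer seed per iteration from a master generator:

    import numpy as np

    def create_seeds(n_iterations, master_seed=0):
        # Hypothetical helper: one RandomState seed per iteration (master_seed is an assumption).
        master_rng = np.random.RandomState(master_seed)
        return master_rng.randint(0, 2 ** 31 - 1, size=n_iterations)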
Example #5
def evaluate_crf_model(x, y, estimator, labels, uniprot=None, verbose=0):
    y_pred = np.asarray(estimator.predict(x))
    statistics = Statistics()
    statistics.update_statistics('all_labels', 'accuracy', estimator.score(x, y))

    bin_labels = [0, 1]
    for i, l in enumerate(labels):
        y_true_binary_l = y[:, i].astype(int)
        y_pred_binary_l = y_pred[:, i].astype(int)
        label_stats = compute_label_statistics(y_true_binary_l, y_pred_binary_l, labels=bin_labels)
        statistics.update_statistics(l, 'Accuracy', accuracy_score(y_true_binary_l, y_pred_binary_l))
        statistics.update_statistics(l, 'Specificity', label_stats[1]['specificity'])
        statistics.update_statistics(l, 'Recall', label_stats[1]['sensitivity'])
        statistics.update_statistics(l, 'Precision', label_stats[1]['precision'])
        statistics.update_statistics(l, 'FDR', label_stats[1]['fdr'])
        statistics.update_statistics(l, 'F-Score (beta=0.5)', fbeta_score(
            y_true_binary_l, y_pred_binary_l, beta=0.5, labels=bin_labels, average='binary'
        ))
        statistics.update_statistics(l, 'F-Score (beta=1)', fbeta_score(
            y_true_binary_l, y_pred_binary_l, beta=1.0, labels=bin_labels, average='binary'
        ))
        # The CRF yields hard 0/1 predictions, so these AUC scores are computed from
        # binary predictions rather than continuous probabilities.
        try:
            roc_auc = roc_auc_score(y_true_binary_l, y_pred_binary_l)
            statistics.update_statistics(l, 'ROC-AUC', roc_auc)
        except (ValueError, AssertionError):
            statistics.update_statistics(l, 'ROC-AUC', np.NaN)
        try:
            pr_auc = average_precision_score(y_true_binary_l, y_pred_binary_l)
            statistics.update_statistics(l, 'PR-AUC', pr_auc)
        except (ValueError, AssertionError):
            statistics.update_statistics(l, 'PR-AUC', np.NaN)

    if verbose:
        for l in labels:
            statistics.print_statistics(l)
    if uniprot and verbose:
        for u, p1, p2 in zip(uniprot, y, y_pred):
            print("\t\t\tResult for {} \n\t\t\t\tTrue: \t{} ||| Pred: \t{}".format(u, p1, p2))

    return statistics
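`compute_label_statistics` is not shown in these snippets. A hypothetical sketch of what it might derive for the positive class from a confusion matrix (toy predictions below) is:

    from sklearn.metrics import confusion_matrix

    y_true = [0, 1, 1, 0, 1, 0]
    y_pred = [0, 1, 0, 0, 1, 1]
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    stats_for_positive_class = {
        'sensitivity': float(tp) / (tp + fn),   # recall
        'specificity': float(tn) / (tn + fp),
        'precision': float(tp) / (tp + fp),
        'fdr': float(fp) / (fp + tp),           # false discovery rate
    }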
Example #6
def multi_label_crf(
        labels,
        df_train,
        df_valid,
        df_test,
        binary,
        connectivity='full',
        vectorizer_method='count'
    ):
    """
    Do you suffer from acute label correlations? Are your samples a part of more than
    one class? Do you have signs of labels have dependency? If you answered yes to at least
    one of those questions then sign up for structured learning today. For a low monthly
    membership fee of $39.99 you can solve all your multi-label woes!

    @param labels:
    @param df_train:
    @param df_valid:
    @param df_test:
    @param connectivity:
    @param vectorizer_method:

    @return:
    """
    stats_container_valid = Statistics()
    stats_container_test = Statistics()

    if vectorizer_method == 'tf-idf':
        vectorizer_node = TfidfVectorizer(
            stop_words=['go:', '', ' '], binary=binary, lowercase=True, sublinear_tf=False, max_df=1.0, min_df=0)
        vectorizer_node.fit(df_train['terms'].values)
        alpha = None
        percentile = 100
    elif vectorizer_method == 'count':
        vectorizer_node = CountVectorizer(stop_words=['go', '', ' '], binary=binary, lowercase=True)
        vectorizer_node.fit(df_train['terms'].values)
        alpha = None
        percentile = 100
    else:
        raise ValueError("Unrecognised vectorizer_method: {}.".format(vectorizer_method))

    x_node_train, y_train, feature_names, selector_node = prep.select_features(
        df=df_train,
        vectorizer=vectorizer_node,
        feature_col='terms',
        label_col='label',
        select_method=None,
        continuous_col=[],
        alpha=alpha,
        percentile=percentile
    )
    x_node_valid, y_valid = prep.transform_features(
        df=df_valid,
        vectorizer=vectorizer_node,
        selector=selector_node,
        feature_col='terms',
        label_col='label',
        continuous_cols=[]
    )

    y_train = np.asarray([prep.binarise_labels(x, labels) for x in y_train], dtype=int)
    y_valid = np.asarray([prep.binarise_labels(x, labels) for x in y_valid], dtype=int)

    if connectivity == 'full':
        n_labels = len(labels)
        edges = np.vstack([x for x in itertools.combinations(range(n_labels), 2)])
        model = MultiLabelClf(n_labels=len(labels), edges=edges, inference_method='ad3')
    elif connectivity == 'tree':
        edges = chow_liu_tree(y_train)
        model = MultiLabelClf(n_labels=len(labels), edges=edges, inference_method='max-product')
    else:
        edges = None
        model = MultiLabelClf(n_labels=len(labels), edges=edges, inference_method='unary')
    x_train = x_node_train.toarray()
    x_valid = x_node_valid.toarray()

    # -------------------- MAKE THE ESTIMATOR -------------------- #
    estimator = OneSlackSSVM(model, max_iter=2, tol=0.001, n_jobs=1)

    # -------------------- LEARN/STATS -------------------- #
    estimator.fit(x_train, y_train)
    stats_container_valid.merge(evaluate_crf_model(x_valid, y_valid, estimator, labels))

    if isinstance(df_test, pd.DataFrame):
        x_node_test, y_test = prep.transform_features(
            df=df_test,
            vectorizer=vectorizer_node,
            selector=selector_node,
            feature_col='terms',
            label_col='label',
            continuous_cols=[]
        )
        y_test = np.asarray([prep.binarise_labels(x, labels) for x in y_test], dtype=int)
        x_test = x_node_test.toarray()
        stats_container_test.merge(evaluate_crf_model(x_test, y_test, estimator, labels))

    # -------------------- RETURN -------------------- #
    if isinstance(df_test, pd.DataFrame):
        return stats_container_valid, stats_container_test
    else:
        return stats_container_valid
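For the 'full' connectivity case, the edge list built with `itertools.combinations` is just every unordered pair of label indices; for three labels it looks like this:

    import itertools
    import numpy as np

    edges = np.vstack(list(itertools.combinations(range(3), 2)))
    # array([[0, 1],
    #        [0, 2],
    #        [1, 2]])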
Example #7
def multi_label_evaluate(y, y_prob, threshold):
    statistics = Statistics()
    y_pred = (y_prob >= threshold).astype(int)
    y_pred_50 = (y_prob >= 0.5).astype(int)

    ranking_loss = label_ranking_loss(y, y_pred)
    lraps = label_ranking_average_precision_score(y, y_pred)
    ranking_loss_50 = label_ranking_loss(y, y_pred_50)
    lraps_50 = label_ranking_average_precision_score(y, y_pred_50)

    f1_macro = f1_score(y, y_pred, average='macro')
    f1_macro_50 = f1_score(y, y_pred_50, average='macro')

    statistics.update_statistics("Multi-Label", "Ranking Loss", ranking_loss)
    statistics.update_statistics("Multi-Label", "Ranking Precision", lraps)
    statistics.update_statistics("Multi-Label", "Ranking Loss (t=0.5)", ranking_loss_50)
    statistics.update_statistics("Multi-Label", "Ranking Precision (t=0.5)", lraps_50)

    statistics.update_statistics("Multi-Label", "Macro F1", f1_macro)
    statistics.update_statistics("Multi-Label", "Macro F1 (t=0.5)", f1_macro_50)

    try:
        auc_macro = roc_auc_score(y, y_pred, average='macro')
        auc_macro_50 = roc_auc_score(y, y_pred_50, average='macro')
        auc_pr_macro = roc_auc_score(y, y_prob, average='macro')

        statistics.update_statistics("Multi-Label", "Macro AUC", auc_macro)
        statistics.update_statistics("Multi-Label", "Macro AUC (t=0.5)", auc_macro_50)
        statistics.update_statistics("Multi-Label", "Macro AUC (Pr)", auc_pr_macro)

    except ValueError:
        statistics.update_statistics("Multi-Label", "Macro AUC", np.NaN)
        statistics.update_statistics("Multi-Label", "Macro AUC (t=0.5)", np.NaN)
        statistics.update_statistics("Multi-Label", "Macro AUC (Pr)", np.NaN)

    return statistics
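Example #7 feeds thresholded predictions into the ranking metrics; these scorers accept any score matrix, though they are more commonly given the continuous probabilities. A tiny self-contained example with invented values:

    import numpy as np
    from sklearn.metrics import label_ranking_average_precision_score, label_ranking_loss

    y_true = np.array([[1, 0, 0], [0, 0, 1]])
    y_score = np.array([[0.75, 0.5, 1.0], [1.0, 0.2, 0.1]])
    loss = label_ranking_loss(y_true, y_score)                      # 0.75
    lrap = label_ranking_average_precision_score(y_true, y_score)   # ~0.42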
Example #8
def evaluate_model(y, y_prob, label, threshold):
    statistics = Statistics()
    y_pred = (y_prob[:, 1] >= threshold).astype(int)

    bin_labels = [0, 1]
    label_stats = compute_label_statistics(y, y_pred, labels=bin_labels)
    statistics.update_statistics(label, 'F (beta=1)', f1_score(y, y_pred, average='binary', labels=[0,1], pos_label=1))
    statistics.update_statistics(label, 'Specificity', label_stats[1]['specificity'])
    statistics.update_statistics(label, 'Recall', label_stats[1]['sensitivity'])
    statistics.update_statistics(label, 'Precision', label_stats[1]['precision'])
    statistics.update_statistics(label, 'FDR', label_stats[1]['fdr'])

    try:
        statistics.update_statistics(label, 'ROC-AUC', roc_auc_score(y, y_pred, average='weighted'))
    except (ValueError, AssertionError):
        statistics.update_statistics(label, 'ROC-AUC', 0.0)
    try:
        pr_auc = average_precision_score(y, y_pred, average='weighted')
        if np.isnan(pr_auc):
            pr_auc = 0.0
        statistics.update_statistics(label, 'PR-AUC', pr_auc)
    except (ValueError, AssertionError):
        statistics.update_statistics(label, 'PR-AUC', 0.0)

    return statistics
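A small self-contained companion to `evaluate_model`: thresholding the positive-class column of a `predict_proba` output, and computing the AUC-style metrics from the continuous scores (which is what they are usually given). The values are toy data:

    import numpy as np
    from sklearn.metrics import average_precision_score, roc_auc_score

    y_true = np.array([0, 1, 1, 0, 1])
    y_prob = np.array([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6], [0.7, 0.3], [0.1, 0.9]])
    y_pred = (y_prob[:, 1] >= 0.5).astype(int)

    roc = roc_auc_score(y_true, y_prob[:, 1])
    pr = average_precision_score(y_true, y_prob[:, 1])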
Example #9
    def do_iteration(data):
    # for i in xrange(iterations):
        i = data[0]
        print("Iteration " + str(i+1))
        train_i = train.copy(deep=True)
        rng = np.random.RandomState()
        rng.seed(seeds[i])
        su_make_dir('llda/models/iteration-{}'.format(i+1))
        folds_i = iterative_stratification(train_i, labels, cv_folds, rng)
        combinations = itertools.combinations(range(0, cv_folds), cv_folds-1)

        # Create the appropriate statistics container for this iteration.
        validation_stats_i = Statistics()
        testing_stats_i = Statistics()

        for n, j in enumerate(combinations):
            # print('\tFold ' + str(n+1))
            su_make_dir('llda/models/iteration-{}/fold-{}'.format(i+1, n+1))
            file_path = 'llda/models/iteration-{}/fold-{}/'.format(i+1, n+1)

            training_folds_j = folds_i[list(j)]
            validation_fold_j = folds_i[[f for f in range(0, cv_folds) if f not in j]]
            assert len(validation_fold_j) == 1

            training_fold = reduce(
                lambda x, y: pd.concat([x, y], ignore_index=True, copy=True),
                training_folds_j[1:],
                training_folds_j[0]
            )
            validation_fold = validation_fold_j[0]

            # shuffle the folds
            if balanced:
                training_fold = training_fold.reindex(rng.permutation(training_fold.index))
                training_fold = prep.reduce_label_bias(training_fold, labels, 'activation', 5, random_state=rng)
            training_fold = training_fold.reindex(rng.permutation(training_fold.index))
            validation_fold = validation_fold.reindex(rng.permutation(validation_fold.index))

            write_tsv(training_fold, file_path + '/train.tsv', test=False)
            write_tsv(validation_fold, file_path + '/valid.tsv', test=True)
            write_tsv(test, file_path + '/test.tsv', test=True)

            # ------------------ CALL JAVA TO LLDA ON THIS PARTAY ----------------- #
            DEVNULL = open(os.devnull, 'w')
            args = [
                'java',
                '-jar',
                '-Xmx2048m',
                'llda/tmt-0.4.0.jar',
                'llda/llda.scala',
                file_path,
                '/train.tsv',
                '/valid.tsv',
                '/test.tsv',
                'model-{}-{}'.format(i+1, n+1),
                '{}-{}'.format(i+1, n+1)
            ]
            p = Popen(args, stdout=DEVNULL, stderr=STDOUT)
            p.wait()

            # Perform evaluation
            validation_proba = np.genfromtxt('llda/results/validation-{}-{}.csv'.format(i+1, n+1), delimiter=',')[:, 1:]
            test_proba = np.genfromtxt('llda/results/test-{}-{}.csv'.format(i+1, n+1), delimiter=',')[:, 1:]
            labels_j = get_labels_from_model(file_path + '/llda-cvb0-model-{}-{}'.format(1+i, n+1))

            validation_stats_i_j = Statistics()
            testing_stats_i_j = Statistics()
            for l_index, l in enumerate(labels_j):
                y_validation = [p for p in validation_fold[l].values]
                y_proba_validation = [[1-p, p] for p in validation_proba[:, l_index]]
                y_pred_validation = [int(p >= threshold) for p in validation_proba[:, l_index]]

                y_hprd = [p for p in test[l].values]
                y_proba_hprd = [[1-p, p] for p in test_proba[:, l_index]]
                y_pred_hprd = [int(p >= threshold) for p in test_proba[:, l_index]]

                validation_stats_i_j = evaluate_model(
                    y=y_validation,
                    y_pred=y_pred_validation,
                    y_pred_prob=y_proba_validation,
                    label=l,
                    statistics=validation_stats_i_j,
                    verbose=0
                )

                testing_stats_i_j = evaluate_model(
                    y=y_hprd,
                    y_pred=y_pred_hprd,
                    y_pred_prob=y_proba_hprd,
                    label=l,
                    statistics=testing_stats_i_j,
                    verbose=0
                )
            validation_stats_i.merge(validation_stats_i_j)
            testing_stats_i.merge(testing_stats_i_j)

        return validation_stats_i, testing_stats_i
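The external L-LDA step above shells out to a Java jar with `Popen`, a manually opened `os.devnull` handle, and `wait()`. On Python 3 the same silence-the-output pattern can be written with `subprocess.run` and `subprocess.DEVNULL`; the command below is a harmless placeholder, not the TMT invocation:

    import subprocess
    import sys

    subprocess.run([sys.executable, '--version'],
                   stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT, check=False)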
Example #10
    # ----------------------------- LOAD DATA ----------------------------------- #
    train, test = prep.prep_data_frames(selection, load_interactome=False)
    labels = get_labels_from_file('data/labels.tsv')
    n = len(labels)

    split_train = {l:0 for l in labels}
    for l in labels:
        split_train[l] = sum(train[l].values)

    split_test = {l:0 for l in labels}
    for l in labels:
        split_test[l] = sum(test[l].values)

    # Create the appropriate statistics container for the whole experiment.
    validation_stats = Statistics()
    testing_stats = Statistics()
    seeds = create_seeds(iterations)

    print("Running Labeled Latent Dirichlet Allocation...")
    def do_iteration(data):
    # for i in xrange(iterations):
        i = data[0]
        print("Iteration " + str(i+1))
        train_i = train.copy(deep=True)
        rng = np.random.RandomState()
        rng.seed(seeds[i])
        su_make_dir('llda/models/iteration-{}'.format(i+1))
        folds_i = iterative_stratification(train_i, labels, cv_folds, rng)
        combinations = itertools.combinations(range(0, cv_folds), cv_folds-1)
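The `itertools.combinations(range(0, cv_folds), cv_folds-1)` call enumerates every way of picking all-but-one fold for training, with the left-out fold used for validation. For `cv_folds = 4`:

    import itertools

    list(itertools.combinations(range(4), 3))
    # [(0, 1, 2), (0, 1, 3), (0, 2, 3), (1, 2, 3)]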