예제 #1
0
def params_prauc_tables(h5d_fl1, h5d_fl2, curvetype, kfolds, params_od, mix, strata, trec):
    """Pair every accepted parameter combination with the AUC of its Precision-Recall curve.

    The parameter grid is walked twice in lockstep through ``param_comb.ParamGridIter``, once in
    'list' mode (the parameter values) and once in 'path' mode (the path used for loading the
    stored results). Only the combinations whose first value is greater than the second one are
    processed. For each of them the results are loaded, the PR curve is calculated and
    interpolated with ``mx.reclev11_max`` and its AUC (``mx.auc``) is appended to the values.

    Args:
        h5d_fl1: Results source forwarded to the loading functions (presumably an open HDF5
            file -- confirm against the caller).
        h5d_fl2: Second results source. It is only used when `mix` is true.
        curvetype: One of 'multiclass', 'multiclass_macro' or 'onevsall'.
        kfolds: Forwarded unchanged to the loading functions.
        params_od: Parameters grid definition given to ``param_comb.ParamGridIter``.
        mix: When true the "multimeasure" loaders, which read both `h5d_fl1` and `h5d_fl2`,
            are used. Otherwise the single source loaders are used.
        strata: Forwarded to the multi-class loaders.
        trec: Forwarded to ``mx.reclev11_max``.

    Returns:
        The 2D array made by ``np.vstack`` with one row per accepted parameter combination.
        The last column contains the AUC, or 0.0 when the AUC calculation failed.

    Raises:
        Exception: When `curvetype` is not one of the supported values. It is raised while the
            first accepted combination is processed.
        ValueError: Raised by ``np.vstack`` when no combination has been accepted at all.
    """
    # The 'multiclass' curve is calculated upon a Truth-Table (see 'is_truth_tbl=True' below)...
    # ...thus the results are requested in binary form only for this case.
    binary = curvetype == 'multiclass'

    # Beginning AUC-Params table building.
    res_lst = list()

    for params_lst, params_path in zip(
        param_comb.ParamGridIter(params_od, 'list'),
            param_comb.ParamGridIter(params_od, 'path')):

        # Skipping the combinations where the first parameter is not greater than the second.
        if not (params_lst[0] > params_lst[1]):
            continue

        if mix:
            pred_scores, expd_y, pred_y = rfse_multiclass_multimeasure_res(
                h5d_fl1, h5d_fl2, kfolds, params_path, binary=binary, strata=strata
            )
        else:
            pred_scores, expd_y, pred_y = multiclass_res(
                h5d_fl1, kfolds, params_path, binary=binary, strata=strata
            )

        # NOTE: Cross-checking and replacing the class-tags of the experiment with virtual...
        # ...class tags referring to the index of the np.unique(expd_y) vector in order...
        # ...to ease the calculations of the curves. The virtual tags start from 1 when the...
        # ...smallest expected tag is above zero, otherwise they start from 0.
        tags2idx_ref = np.unique(expd_y)
        i_fix = 1 if tags2idx_ref[0] > 0 else 0
        for i, tg in enumerate(tags2idx_ref):
            expd_y[np.where(expd_y == tg)] = i + i_fix
            pred_y[np.where(pred_y == tg)] = i + i_fix

        # Selecting the case and calculating the precision recall curves.
        if curvetype == 'multiclass':

            # NOTE: Option 'is_truth_tbl' is critical to be selected correctly depending...
            # ...on the input. Here the input has been loaded with binary=True.
            prec, recl, _ = mx.pr_curve(
                expd_y, pred_scores, full_curve=True, is_truth_tbl=True
            )

            # Interpolating the curve with mx.reclev11_max().
            prec, recl = mx.reclev11_max(prec, recl, trec=trec)

        elif curvetype == 'multiclass_macro':

            prec, recl, _ = mx.pr_curve_macro(
                expd_y, pred_y, pred_scores, full_curve=True,
            )

            # Interpolating the curve with mx.reclev11_max().
            prec, recl = mx.reclev11_max(prec, recl, trec=trec)

        elif curvetype == 'onevsall':

            # Finding unique genres.
            # NOTE(review): These are the virtual tags made above and not the original ones...
            # ...confirm that the one-vs-all loaders are expecting them.
            gnr_tgs = np.unique(expd_y)

            # Precision and Recall scores lists of the PR curve per genre.
            prec_lst = list()
            recl_lst = list()

            # Calculating the interpolated PR curve per genre tag.
            for gnr in gnr_tgs:

                if mix:
                    gnr_scores, gnr_expd_y, _ = onevsall_multimeasure_res(
                        h5d_fl1, h5d_fl2, gnr, kfolds, params_path
                    )
                else:
                    gnr_scores, gnr_expd_y, _ = onevsall_res(
                        h5d_fl1, gnr, kfolds, params_path
                    )

                # NOTE: Option 'is_truth_tbl' is critical to be selected correctly depending...
                # ...on the input.
                prec_val, recl_val, _ = mx.pr_curve(
                    gnr_expd_y, gnr_scores, full_curve=True, is_truth_tbl=False
                )

                # Interpolating the curve with mx.reclev11_max().
                prec_val, recl_val = mx.reclev11_max(prec_val, recl_val, trec=trec)

                # Keeping the scores of THIS genre. Appending 'prec'/'recl' here, as it was...
                # ...done before, fails with NameError because they are not defined yet.
                prec_lst.append(prec_val)
                recl_lst.append(recl_val)

            # Calculating the PR Averaged Macro Curves values for 1-vs-All case.
            prec = np.mean(np.vstack(prec_lst), axis=0)
            recl = np.mean(np.vstack(recl_lst), axis=0)

        else:
            raise Exception('Invalide curvetype argument value.')

        # Calculating the AUC value. This is a deliberate best-effort, i.e. a failure is...
        # ...reported and the AUC is set to 0.0 instead of stopping the whole table building.
        try:
            auc_val = mx.auc(recl, prec)
        except Exception:
            # Single string formatting gives the same output in both Python 2 and Python 3.
            print("Warning: %s PR AUC is for these params has set to 0.0" % (params_path,))
            auc_val = 0.0

        # Extending the parameters list with the AUC and keeping it as a row of the table.
        params_lst.extend([auc_val])
        res_lst.append(params_lst)

    # Stacking and returning the data collected in a 2D array. Last column contain the AUC for...
    # ...every parameters values possible combination.
    return np.vstack(res_lst)
예제 #2
0
def PRConf_table(h5d_fl1, h5d_fl2, kfolds, params_path, mix, strata, prereccon=0):
    """Precision-Recall table and/or Contingency table for one parameters path.

    The multi-class results are loaded (non-binary form) and the contingency table is
    calculated with ``mx.seq_contingency_table`` upon the expected and the predicted values.

    Args:
        h5d_fl1: Results source forwarded to the loading functions.
        h5d_fl2: Second results source. It is only used when `mix` is true.
        kfolds: Forwarded unchanged to the loading functions.
        params_path: The parameters path of the results to be loaded.
        mix: When true ``rfse_multiclass_multimeasure_res`` is used for loading (both sources),
            otherwise ``multiclass_res`` is used.
        strata: Forwarded to the loading functions.
        prereccon: Returning mode. 0 returns both tables, 1 returns only the Precision-Recall
            table and 2 returns only the contingency table.

    Returns:
        ``(pr_tbl, conf_mtrx)``, ``pr_tbl`` or ``conf_mtrx`` depending on `prereccon`.
        ``pr_tbl`` is the list ``[precisions, recalls]``. A class with a zero denominator is
        omitted from the respective list, thus the two lists can have different lengths.
        ``conf_mtrx`` has one extra last row containing the sums of its columns.

    Raises:
        Exception: When `prereccon` is not one of {0, 1, 2}, or when the samples count of the
            zero tag disagrees with the smallest expected class tag.
    """
    # Rejecting an invalid returning mode before any loading or calculation takes place.
    if prereccon not in (0, 1, 2):
        raise Exception("Returning mode 'prereccon' variable invalid value. Valid values {0,1,2}.")

    # Beginning Contingency table building
    if mix:
        rfse_data = rfse_multiclass_multimeasure_res(
            h5d_fl1, h5d_fl2, kfolds, params_path, binary=False, strata=strata
        )
    else:
        rfse_data = multiclass_res(
            h5d_fl1, kfolds, params_path, binary=False, strata=strata
        )

    # 2nd element contains the expected y values list.
    exp_y = rfse_data[1]

    # 3rd element contains the predicted y values list.
    pred_y = rfse_data[2]

    # Getting the expected classes.
    exp_cls_tags_set = np.unique(exp_y)

    # Calculating contingency table.
    conf_mtrx = mx.seq_contingency_table(
        exp_y, pred_y, exp_cls_tags_set=exp_cls_tags_set, arr_type=np.int32
    )

    pr_tbl = None

    if prereccon in (0, 1):
        # Calculating precision recall scores.

        # Getting the number of samples per class. Zero tag is included.
        # NOTE: The builtin int is used because the np.int alias has been removed from NumPy.
        smpls_per_cls = np.bincount(np.array(exp_y, dtype=int))

        # Keeping from 1 to end array in case the expected class tags start with above zero
        # values. When the tags start from zero the counts are kept as they are. Anything else
        # raises an Exception.
        if smpls_per_cls[0] == 0 and exp_cls_tags_set[0] > 0:
            smpls_per_cls = smpls_per_cls[1:]
        elif not (smpls_per_cls[0] > 0 and exp_cls_tags_set[0] == 0):
            raise Exception("Samples count in zero bin is different to the expected class tag cnt!")

        # The diagonal is used as numerator for both scores.
        diag = np.diag(conf_mtrx)

        # Calculating Precision per class, i.e. diagonal over the sums of the table's rows.
        # NOTE(review): This assumes that the rows are the predicted classes -- confirm it
        # against mx.seq_contingency_table().
        precisions = [
            dg / float(pred_docs)
            for dg, pred_docs in zip(diag, np.sum(conf_mtrx, axis=1))
            if pred_docs > 0
        ]

        # Calculating Recall per class, i.e. diagonal over the expected samples per class.
        recalls = [
            dg / float(splpc)
            for dg, splpc in zip(diag, smpls_per_cls)
            if splpc > 0
        ]

        pr_tbl = [precisions, recalls]

    if prereccon in (0, 2):
        # Appending the sums of the columns as the last row of the table. This must stay after
        # the scores calculation above, which requires the square table.
        conf_mtrx = np.vstack((conf_mtrx, conf_mtrx.sum(axis=0)))

    # Returning...
    if prereccon == 0:
        return (pr_tbl, conf_mtrx)
    if prereccon == 1:
        return pr_tbl
    return conf_mtrx
예제 #3
0
    elif comb_val[3] == 'Cosine' or comb_val[3] == '':
        h5d_fl1 = tb.open_file(h5d_fl + '.h5', 'r')

    else:
        raise Exception("Option: " + comb_val[3] + " is not valid for Measure Option")

    # Getting the predictions
    if comb_val[3] == 'Comb':

        # Building the parameters path
        params_path = plist2ppath(comb_val[4], ensbl=comb_val[0])

        pred_scores, expd_y, pred_y = rfse_multiclass_multimeasure_res(
            # h5d_fl1, h5d_fl2, kfolds, params_path, comb_val[4][2],
            # genre_tag=None, binary=True, strata=None
            h5d_fl1, h5d_fl2, kfolds, params_path, binary=False, strata=None
            #  binary=False <- for Micro
        )

    else:

        # Building the parameters path
        params_path = plist2ppath(comb_val[4], ensbl=comb_val[0])

        pred_scores, expd_y, pred_y = multiclass_res(
            h5d_fl1, kfolds, params_path, binary=False, strata=None
        )

    # Closing the h5d files.
    if comb_val[3] == 'Comb':
        h5d_fl1.close()