Example #1
def calculate(idd, n_jobs, debug):
    """Executes FS and RPS for a given task. Executed by cluster.

    Args:
      idd (int): Jobid. Used to find the right task
      n_jobs (int): Number of parallel jobs made by the
                    random parameter search. Does nothing otherwise.
    """
    # task = (foldnr, fstype, args, clfname, randseed) or (foldnr, clfname, randseed)
    task = b.loadfile(f"{tmpdirectory}/tasks.json")[idd]
    foldxy = np.load(f"{tmpdirectory}/folds.pkl", allow_pickle=True)[task[0]]
    df = pd.read_pickle(f"{tmpdirectory}/dataframe.pkl")
    if len(task) == 5:  # Normal procedure with Feature Selection first.
        foldnr, fstype, args, clfname, randseed = task
        ftlist, mask, fname = fs.feature_selection(foldxy, fstype, args, df)  # FS - Done.
    elif len(task) == 3:  # A set featurelist was used.
        foldnr, clfname, randseed = task
        ftlist = b.loadfile(f"{tmpdirectory}/set_fl.json")
        mask = [f in ftlist for f in df.columns]
        fname = "Set Featurelist"
    else:
        raise ValueError("Incorrect number of arguments in the taskfile: {len(task)} should be 5 or 3")
    scores, best_esti, y_labels, coefs = rps.random_param_search(
        mask, clfname, foldxy, n_jobs, df, randseed, debug)
    best_esti_params = best_esti.get_params()
    best_esti = (type(best_esti).__name__, best_esti_params)  # Creates readable tuple that can be dumped.
    b.dumpfile([foldnr, scores, best_esti, ftlist, fname, y_labels], f"{tmpdirectory}/task_results/{idd}.json")
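
For reference, a minimal sketch of the two task layouts that calculate() accepts; the concrete values below are made-up assumptions, not entries from a real {tmpdirectory}/tasks.json:

# Hypothetical task entries (values are illustrative only).
task_with_feature_selection = (0, "lasso", {"n_features": 40}, "gradientboosting", 42)  # (foldnr, fstype, args, clfname, randseed)
task_with_set_featurelist = (0, "gradientboosting", 42)  # (foldnr, clfname, randseed)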
Example #2
def plotall_precision_recall(new_plots_dir, exclude_string=None):
    """Similar to plotall_roc() but for precision-recall curves.
    Does not include the option to include files from different programs.
    The rest of the comments are the same.
    """
    fontsize = 18  # Size of the text for the labels and legend.
    plt.figure(figsize=(12.8, 9.6))
    for dirname in os.listdir(new_plots_dir):
        if not os.path.isdir(f"{new_plots_dir}/{dirname}"):
            continue
        elif exclude_string is not None and exclude_string in dirname:
            continue
        for filename in os.listdir(f"{new_plots_dir}/{dirname}"):
            if filename.startswith("results.json"):
                y_true, y_score = [], []
                for sc, be, ft, fn, y_labels in b.loadfile(
                        f"{new_plots_dir}/{dirname}/{filename}").values():
                    y_true.extend(y_labels[0])
                    y_score.extend(y_labels[1])
                precision, recall, thresholds = precision_recall_curve(
                    y_true, y_score)
                plt.plot(recall, precision, label=f"{dirname}")
    plt.xlabel('Recall', fontsize=fontsize)
    plt.ylabel('Precision', fontsize=fontsize)
    plt.legend(loc='best')
    plt.savefig(f"{new_plots_dir}/all_precision_recall")
    plt.show()
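
A hedged usage sketch, assuming the new_plots_dir/<runname>/results.json layout described for plotall_roc() below; the directory name "plots" and the excluded run name are placeholders:

# Illustrative call: plots precision-recall curves for every run under "plots"
# except runs whose directory name contains "neural".
plotall_precision_recall("plots", exclude_string="neural")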
Example #3
def getresults():
    """Analyzes the result files in rps_results and
    returns only the ones with the best best_esti_score in each fold.
    """
    results = defaultdict(lambda: [[0]])
    for rfile in os.listdir(f"{tmpdirectory}/task_results"):
        f = b.loadfile(f"{tmpdirectory}/task_results/{rfile}")
        if f[1][0] > results[f[0]][0][0] or f[1][0] == -1:  # TODO: remove the == -1 special case
            # For each fold the result with the best best_esti_score is saved
            # If the best_esti_score is -1 it means a set classifier was used.
            results[f[0]] = f[1:]
    b.dumpfile(results, "results/results.json")
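
For orientation, a sketch of what one entry of the dumped results.json looks like, reconstructed from the list written by calculate() (f[1:] drops the leading foldnr); all concrete values are made up:

# results maps foldnr -> [scores, best_esti, ftlist, fname, y_labels]; the values below are illustrative.
example_results = {
    0: [
        [0.91, 0.88, [0.85, 0.90, 0.80]],                       # scores: best_esti_score, test_score, accuracy_score (tpr, tnr, precision)
        ("GradientBoostingClassifier", {"n_estimators": 100}),  # best_esti: (name, params)
        ["feature_a", "feature_b"],                             # ftlist
        "Set Featurelist",                                      # fname
        [[1, 0, 1], [0.9, 0.2, 0.7]],                           # y_labels: [y_true, y_score]
    ],
}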
Example #4
def plotall_roc(new_plots_dir, old_plots_dir, exclude_string=None):
    """Experimental function that allows to easily draw ROC figures from different runs into a single file.
    Optionally can also draw plots from specific older files.
    Note that these older files need to have a very specific file format.
    (This function was only added for convenience use to compare results of pig.py with different programs)

    Args:
      new_plots_dir(string): Directory of runs to plot. This directory should
                             contain subdirectories with their runnames as
                             directorynames and those need to contain a "results.json" file.
                             Example: new_plots_dir/gradientboosting42/results.json
      old_plots_dir(string): Directory containing old roc files. Those files should start with "roc-all."
      exclude_string(string): If existing it will ignore subdirectories that contain this string in their name.
                              Example: If you have gradientboosting and neuralnet runs set it to "neural"
                                       to only plot gradientboosting runs.

    Returns:
      Nothing but saves the resulting ROC figure into the new_plots_dir.
    """
    fontsize = 18  # Size of the text for the labels and legend.
    plt.figure(figsize=(12.8, 9.6))
    plt.plot([0, 1], [0, 1], 'k--')
    for dirname in os.listdir(new_plots_dir):
        if not os.path.isdir(f"{new_plots_dir}/{dirname}"):
            continue
        elif exclude_string is not None and exclude_string in dirname:
            continue
        for filename in os.listdir(f"{new_plots_dir}/{dirname}"):
            if filename.startswith("results.json"):
                y_true, y_score = [], []
                for sc, be, ft, fn, y_labels in b.loadfile(
                        f"{new_plots_dir}/{dirname}/{filename}").values():
                    y_true.extend(y_labels[0])
                    y_score.extend(y_labels[1])
                fpr, tpr, thresholds = roc_curve(y_true, y_score)
                auc = roc_auc_score(y_true, y_score)
                plt.plot(fpr, tpr, label=f"{dirname} - {round(auc,4)}")
    if old_plots_dir:
        for filename in os.listdir(old_plots_dir):
            if filename.startswith("roc-all"):
                with open(f"{old_plots_dir}/{filename}") as file:
                    x, y = [], []
                    for line in file:
                        sp = line.split()[:2]
                        x.append(float(sp[0]))
                        y.append(float(sp[1]))
                    plt.plot(y, x, label=f"{filename}")
    plt.xlabel('False positive rate', fontsize=fontsize)
    plt.ylabel('True positive rate', fontsize=fontsize)
    plt.legend(loc='best')
    plt.savefig(f"{new_plots_dir}/all_roc")
    plt.show()
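
A hedged usage sketch of the layout from the docstring; the directory names "plots" and "old_rocs" and the run names are placeholders:

# Assumed layout:
#   plots/gradientboosting42/results.json
#   plots/neuralnet7/results.json
#   old_rocs/roc-all.someprogram   <- whitespace-separated columns, first two are read
plotall_roc("plots", "old_rocs", exclude_string="neural")  # plots the gradientboosting run plus the old files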
Example #5
def load_pn_files(use_rnaz, use_filters, numneg, randseed, debug):
    """Loads the positive and negative instances and caches them in tmpdirectory
    so that repeated runs with the same parameters can skip loadfiles.loaddata().
    """
    fn = f"{tmpdirectory}/pn_{use_rnaz}_{use_filters}_{numneg}_{randseed}_{debug}.json"

    # If a cache file with the loaded data already exists, skip loadfiles.loaddata()
    if os.path.isfile(fn):
        p, n = b.loadfile(fn)  # pos, neg from the cached file
    else:
        if use_filters:
            p, n = loadfiles.loaddata("data", numneg, randseed, use_rnaz)
        else:
            p, n = loadfiles.loaddata("data", numneg, randseed, use_rnaz, 'both', blacklist_file="noblacklist")
        b.dumpfile((p, n), fn)
    return p, n
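
A hedged usage sketch; the parameter values are arbitrary, and loadfiles.loaddata() together with the "data" directory are assumed to exist in the project:

# The first call loads from "data" and writes the cache file; later calls with the
# same parameters read the cached JSON instead.
p, n = load_pn_files(use_rnaz=False, use_filters=True, numneg=10000, randseed=42, debug=False)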
Example #6
def showresults(args, resultfile="results/results.json", showplots=True):
    """Prints and plots the contents of a results file; which parts are shown
    is controlled by the letters contained in args (see the "h" option below).
    """
    results = b.loadfile(resultfile)
    estimators = defaultdict(lambda: defaultdict(list))
    plt.rcParams.update({'font.size': 22})
    ftlists = []
    c = Counter()
    y_true = []
    y_score = []
    for scores, best_esti, ftlist, fname, y_labels in results.values():
        esti_name, params = best_esti
        best_esti_score, test_score, accuracy_score = scores
        if best_esti_score == -1:
            params["best_esti_score"] = None
        else:
            params["best_esti_score"] = round(best_esti_score, 4)
        params["test_score"] = round(test_score, 4)
        params["accuracy_score"] = [round(acc, 4) for acc in accuracy_score]
        for key, value in params.items():
            estimators[esti_name][key].append(value)
        ftlists.append((fname, ftlist))  # ?
        c.update(ftlist)
        y_true.extend(y_labels[0])
        y_score.extend(y_labels[1])
    if "f" in args:
        pprint(c.most_common())
        print("\n")
    if "e" in args:
        for key in estimators.keys():
            avg_tpr = 0  # Recall
            avg_tnr = 0
            avg_precision = 0
            print(f"{key}:")
            print("-" * (len(key) + 1))
            for param in estimators[key].items():
                print(f"{param[0]}: {param[1]}")
            for tpr, tnr, precision in estimators[key][
                    "accuracy_score"]:  # ROUNDED accuracies
                avg_tpr += tpr  # Recall
                avg_tnr += tnr
                avg_precision += precision
            i = len(estimators[key]["accuracy_score"])
            avg_tpr, avg_tnr, avg_precision = avg_tpr / i, avg_tnr / i, avg_precision / i
            print(f"Average TPR: {avg_tpr}")
            print(f"Average TNR: {avg_tnr}")
            print(f"Average Precision: {avg_precision}")
            print(
                f"Average F1: {2*((avg_precision*avg_tpr)/(avg_precision+avg_tpr))}"
            )
            print("\n")
    if "l" in args:
        for x in ftlists:
            pprint((x[0], len(x[1]), sorted(x[1])))
    if "r" in args:
        plt.figure(figsize=(12.8, 9.6))
        fpr, tpr, thresholds = roc_curve(y_true, y_score)
        auc = roc_auc_score(y_true, y_score)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.plot(fpr, tpr, label=f"{round(auc, 4)}")
        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        plt.legend(loc='best')
        plt.savefig("results/results_roc")
        if showplots:
            plt.show()
    if "p" in args:
        plt.figure(figsize=(12.8, 9.6))
        precision, recall, thresholds = precision_recall_curve(y_true, y_score)
        plt.plot(recall, precision)
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.savefig("results/results_precision_recall")
        if showplots:
            plt.show()
    if "c" in args:
        df = pd.read_pickle("results/dataframe.pkl")
        cm = sns.clustermap(df.corr())
        plt.savefig("results/clustermap.png")
    if "h" in args:
        print("Usage: pig.py -r {fenrp}\n", \
              "f - featurelists with number of occurences\n", \
              "e - estimators\n", \
              "l - Shows ALL featurelists with the info used to create them\n", \
              "r - Creates and plots the roc_curve\n", \
              "p - Creates and plots the precision_recall_curve", \
              "c - Creates and plots the clustered correlationmatrix")
Example #7
def getresults2(numrandomtasks=10000, n_best=10, tmpdirectory="tmp"):
    """
    Calculates the average F1 scores random featurelist over every fold.
    Collects these into a histogram and dumps them into the results directory.
    Also takes the n_best Featurelists with the best average F1-Scores and dumps them too.

    Warning:
      This program will not work correctly if:
        - numrandomtasks is not the correct value
        - pig.py was executed with other feature selection methods than --random
        - pig.py was executed with more than one classifier

    Example:
      - Executed pig.py --random 40 10000 -n 7
      => Results in 70000 files with 40 features each.
      File 0-9999 beeing 10000 different featurelists for the 1. fold.
      Files 10000-19999 beeing the same 10000 featurelists for the 2. fold.
      ...
      - This code will take taskid % numrandomtasks and calculate the
        average F1 score for each of these featurelists over every fold.
      => Takes 7 files each and calculates their average F1 score
      => Results in 10000 scores in total.

    Args:
      numrandomtasks(int): Needs to be the same number as the 2. argument of
                           --random in pig.py, so the number of
                           random featurelists per fold.
      n_best(int): Number of best featurelists that should be saved seperately
      tmpdirectory(String): Location of the "tmp" directory.
    """
    score_d = defaultdict(list)
    avg_score_d = defaultdict(list)
    featurelist_d = defaultdict(list)

    i = 0  # Counts results with NaN precision (never reported, kept for debugging).
    j = 0  # Counts featurelists with NaN average F1 (never reported, kept for debugging).
    for rfile in os.listdir(f"{tmpdirectory}/task_results"):
        taskid = int(rfile.split(".")[0])
        f = b.loadfile(f"{tmpdirectory}/task_results/{rfile}")
        scores = f[1]
        fl = f[3]  # Featurelist
        tpr, precision = scores[2][0], scores[2][2]
        if np.isnan(precision):
            i += 1
        score_d[taskid % numrandomtasks].append((tpr, precision))
        # 10,000 dictionary entries with 7 score tuples each (in the example above)
        featurelist_d[taskid % numrandomtasks] = fl
        # 10,000 different featurelists

    # Calculate average F1-Scores of each entry
    best_f1_score = 0
    best_key = 0
    f1_list = []  # Used for Histogram
    for key in score_d:
        sum_tpr, sum_precision = 0, 0
        for tpr, precision in score_d[key]:
            sum_tpr += tpr
            sum_precision += precision
        n = len(score_d[key])
        avg_tpr, avg_precision = sum_tpr / n, sum_precision / n
        f1 = 2 * ((avg_precision * avg_tpr) / (avg_precision + avg_tpr))
        if np.isnan(f1):
            j += 1
        f1_list.append(f1)
        avg_score_d[key] = f1

    # Get the n_best featurelists with the highest average F1 scores
    best_featurelists = {}
    for key, score in sorted(avg_score_d.items(), key=itemgetter(1),
                             reverse=True)[:n_best]:
        best_featurelists[key] = (score, featurelist_d[key])
    b.dumpfile(best_featurelists, "results/best_featurelists.json")

    # Draw the histogram
    fontsize = 18  # Size of the text for the labels and legend.
    plt.figure(figsize=(12.8, 9.6))
    plt.xlabel("F1-Score", fontsize=fontsize)
    plt.ylabel("Number of Scores", fontsize=fontsize)
    plt.hist(f1_list, bins=100)
    plt.savefig("results/f1_histogram.png")