def calculate(idd, n_jobs, debug):
    """Executes FS and RPS for a given task. Executed by the cluster.

    Args:
      idd (int): Job id. Used to find the right task.
      n_jobs (int): Number of parallel jobs used by the random parameter search.
        Does nothing otherwise.
      debug (bool): Passed through to the random parameter search.
    """
    # task = (foldnr, fstype, args, clfname, randseed) or (foldnr, clfname, randseed)
    task = b.loadfile(f"{tmpdirectory}/tasks.json")[idd]
    foldxy = np.load(f"{tmpdirectory}/folds.pkl", allow_pickle=True)[task[0]]
    df = pd.read_pickle(f"{tmpdirectory}/dataframe.pkl")
    if len(task) == 5:  # Normal procedure with Feature Selection first.
        foldnr, fstype, args, clfname, randseed = task
        ftlist, mask, fname = fs.feature_selection(foldxy, fstype, args, df)  # FS - Done.
    elif len(task) == 3:  # A set featurelist was used.
        foldnr, clfname, randseed = task
        ftlist = b.loadfile(f"{tmpdirectory}/set_fl.json")
        mask = [f in ftlist for f in df.columns]
        fname = "Set Featurelist"
    else:
        raise ValueError(f"Incorrect number of arguments in the taskfile: {len(task)} should be 5 or 3")
    scores, best_esti, y_labels, coefs = rps.random_param_search(
        mask, clfname, foldxy, n_jobs, df, randseed, debug)
    best_esti_params = best_esti.get_params()
    best_esti = (type(best_esti).__name__, best_esti_params)  # Readable tuple that can be dumped.
    b.dumpfile([foldnr, scores, best_esti, ftlist, fname, y_labels],
               f"{tmpdirectory}/task_results/{idd}.json")
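# A minimal sketch of how calculate() could be dispatched as one job of a cluster
# array. The environment variable name "SGE_TASK_ID" and this helper are
# assumptions for illustration, not part of the original pipeline.
def _example_cluster_entry(n_jobs=24, debug=False):
    import os
    idd = int(os.environ["SGE_TASK_ID"]) - 1  # SGE task ids are 1-based
    calculate(idd, n_jobs, debug)  # writes {tmpdirectory}/task_results/<idd>.json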
def plotall_precision_recall(new_plots_dir, exclude_string=None):
    """Similar to plotall_roc() but for precision-recall curves.
    Does not include the option to add files from other programs.
    See plotall_roc() for the remaining argument descriptions.
    """
    fontsize = 18  # Size of the text for the labels and legend.
    plt.figure(figsize=(12.8, 9.6))
    for dirname in os.listdir(new_plots_dir):
        if not os.path.isdir(f"{new_plots_dir}/{dirname}"):
            continue
        elif exclude_string is not None and exclude_string in dirname:
            continue
        for filename in os.listdir(f"{new_plots_dir}/{dirname}"):
            if filename.startswith("results.json"):
                y_true, y_score = [], []
                for sc, be, ft, fn, y_labels in b.loadfile(
                        f"{new_plots_dir}/{dirname}/{filename}").values():
                    y_true.extend(y_labels[0])
                    y_score.extend(y_labels[1])
                precision, recall, thresholds = precision_recall_curve(y_true, y_score)
                plt.plot(recall, precision, label=f"{dirname}")
    plt.xlabel('Recall', fontsize=fontsize)
    plt.ylabel('Precision', fontsize=fontsize)
    plt.legend(loc='best')
    plt.savefig(f"{new_plots_dir}/all_precision_recall")
    plt.show()
def getresults():
    """Analyzes the result files in task_results and, for each fold, keeps only
    the result with the best best_esti_score.
    """
    results = defaultdict(lambda: [[0]])
    for rfile in os.listdir(f"{tmpdirectory}/task_results"):
        f = b.loadfile(f"{tmpdirectory}/task_results/{rfile}")
        # For each fold the result with the best best_esti_score is saved.
        # A best_esti_score of -1 means a set classifier was used.
        if f[1][0] > results[f[0]][0][0] or f[1][0] == -1:  # TODO: Remove the == -1 part
            results[f[0]] = f[1:]
    b.dumpfile(results, "results/results.json")
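# Hedged sketch of the shape of one results.json entry produced by getresults(),
# inferred from what showresults() and the plotting helpers unpack below; the
# concrete values are made up for illustration.
_EXAMPLE_RESULT_ENTRY = {
    "0": [                                                    # fold number
        [0.91, 0.88, [0.83, 0.97, 0.65]],                     # scores: best_esti_score, test_score, [tpr, tnr, precision]
        ["GradientBoostingClassifier", {"n_estimators": 100}],  # best_esti: (name, params)
        ["feature_a", "feature_b"],                           # ftlist
        "Set Featurelist",                                    # fname
        [[1, 0, 1], [0.9, 0.2, 0.7]],                         # y_labels: (y_true, y_score)
    ],
}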
def plotall_roc(new_plots_dir, old_plots_dir, exclude_string=None):
    """Experimental function that makes it easy to draw ROC curves from
    different runs into a single figure. Optionally also draws plots from
    specific older files; note that these older files need a very specific
    format. (This function was only added as a convenience to compare results
    of pig.py with other programs.)

    Args:
      new_plots_dir (string): Directory of runs to plot. This directory should
        contain subdirectories named after their runs, each containing a
        "results.json" file.
        Example: new_plots_dir/gradientboosting42/results.json
      old_plots_dir (string): Directory containing old ROC files. Those files
        should start with "roc-all".
      exclude_string (string): If given, subdirectories whose names contain this
        string are ignored. Example: if you have gradientboosting and neuralnet
        runs, set it to "neural" to only plot the gradientboosting runs.

    Returns:
      Nothing, but saves the resulting ROC figure into new_plots_dir.
    """
    fontsize = 18  # Size of the text for the labels and legend.
    plt.figure(figsize=(12.8, 9.6))
    plt.plot([0, 1], [0, 1], 'k--')
    for dirname in os.listdir(new_plots_dir):
        if not os.path.isdir(f"{new_plots_dir}/{dirname}"):
            continue
        elif exclude_string is not None and exclude_string in dirname:
            continue
        for filename in os.listdir(f"{new_plots_dir}/{dirname}"):
            if filename.startswith("results.json"):
                y_true, y_score = [], []
                for sc, be, ft, fn, y_labels in b.loadfile(
                        f"{new_plots_dir}/{dirname}/{filename}").values():
                    y_true.extend(y_labels[0])
                    y_score.extend(y_labels[1])
                fpr, tpr, thresholds = roc_curve(y_true, y_score)
                auc = roc_auc_score(y_true, y_score)
                plt.plot(fpr, tpr, label=f"{dirname} - {round(auc, 4)}")
    if old_plots_dir:
        for filename in os.listdir(old_plots_dir):
            if filename.startswith("roc-all"):
                with open(f"{old_plots_dir}/{filename}") as file:
                    x, y = [], []
                    for line in file:
                        sp = line.split()[:2]
                        x.append(float(sp[0]))
                        y.append(float(sp[1]))
                    plt.plot(y, x, label=f"{filename}")
    plt.xlabel('False positive rate', fontsize=fontsize)
    plt.ylabel('True positive rate', fontsize=fontsize)
    plt.legend(loc='best')
    plt.savefig(f"{new_plots_dir}/all_roc")
    plt.show()
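# Hypothetical usage of plotall_roc(), assuming a layout like
#   plots/gradientboosting42/results.json
#   plots/neuralnet7/results.json
#   old_rocs/roc-all.program_x
# (all directory and run names here are made up). With exclude_string="neural",
# only the gradientboosting run and the old roc-all file would be drawn.
def _example_plotall_roc():
    plotall_roc("plots", "old_rocs", exclude_string="neural")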
def load_pn_files(use_rnaz, use_filters, numneg, randseed, debug):
    """Loads the positive and negative instances, caching them as JSON so that
    repeated runs with the same parameters can skip loadfiles.loaddata().
    """
    fn = f"{tmpdirectory}/pn_{use_rnaz}_{use_filters}_{numneg}_{randseed}_{debug}.json"
    # If a file with the loaded data already exists, skip loadfiles.loaddata()
    if os.path.isfile(fn):
        p, n = b.loadfile(fn)  # pos, neg from the cached file
    else:
        if use_filters:
            p, n = loadfiles.loaddata("data", numneg, randseed, use_rnaz)
        else:
            p, n = loadfiles.loaddata("data", numneg, randseed, use_rnaz, 'both',
                                      blacklist_file="noblacklist")
        b.dumpfile((p, n), fn)
    return p, n
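# Sketch of the caching behavior of load_pn_files(): the first call with a given
# parameter combination runs loadfiles.loaddata() and writes the JSON cache,
# repeated calls with the same parameters only reload that file. The parameter
# values below are illustrative only.
def _example_load_pn():
    p, n = load_pn_files(use_rnaz=True, use_filters=True, numneg=10000,
                         randseed=42, debug=False)      # slow: builds the cache
    p, n = load_pn_files(True, True, 10000, 42, False)  # fast: reads the cache
    return p, n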
def showresults(args, resultfile="results/results.json", showplots=True):
    """Prints and plots the contents of a results file. Which parts are shown is
    controlled by the flag characters in `args` (see the "h" flag).
    """
    results = b.loadfile(resultfile)
    estimators = defaultdict(lambda: defaultdict(list))
    plt.rcParams.update({'font.size': 22})
    ftlists = []
    c = Counter()
    y_true = []
    y_score = []
    for scores, best_esti, ftlist, fname, y_labels in results.values():
        esti_name, params = best_esti
        best_esti_score, test_score, accuracy_score = scores
        if best_esti_score == -1:
            params["best_esti_score"] = None
        else:
            params["best_esti_score"] = round(best_esti_score, 4)
        params["test_score"] = round(test_score, 4)
        params["accuracy_score"] = [round(acc, 4) for acc in accuracy_score]
        for key, value in params.items():
            estimators[esti_name][key].append(value)
        ftlists.append((fname, ftlist))
        c.update(ftlist)
        y_true.extend(y_labels[0])
        y_score.extend(y_labels[1])
    if "f" in args:
        pprint(c.most_common())
        print("\n")
    if "e" in args:
        for key in estimators.keys():
            avg_tpr = 0  # Recall
            avg_tnr = 0
            avg_precision = 0
            print(f"{key}:")
            print("-" * (len(key) + 1))
            for param in estimators[key].items():
                print(f"{param[0]}: {param[1]}")
            for tpr, tnr, precision in estimators[key]["accuracy_score"]:  # ROUNDED accuracies
                avg_tpr += tpr  # Recall
                avg_tnr += tnr
                avg_precision += precision
            i = len(estimators[key]["accuracy_score"])
            avg_tpr, avg_tnr, avg_precision = avg_tpr / i, avg_tnr / i, avg_precision / i
            print(f"Average TPR: {avg_tpr}")
            print(f"Average TNR: {avg_tnr}")
            print(f"Average Precision: {avg_precision}")
            print(f"Average F1: {2 * ((avg_precision * avg_tpr) / (avg_precision + avg_tpr))}")
            print("\n")
    if "l" in args:
        for x in ftlists:
            pprint((x[0], len(x[1]), sorted(x[1])))
    if "r" in args:
        plt.figure(figsize=(12.8, 9.6))
        fpr, tpr, thresholds = roc_curve(y_true, y_score)
        auc = roc_auc_score(y_true, y_score)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.plot(fpr, tpr, label=f"{round(auc, 4)}")
        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        plt.legend(loc='best')
        plt.savefig("results/results_roc")
        if showplots:
            plt.show()
    if "p" in args:
        plt.figure(figsize=(12.8, 9.6))
        precision, recall, thresholds = precision_recall_curve(y_true, y_score)
        plt.plot(recall, precision)
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.savefig("results/results_precision_recall")
        if showplots:
            plt.show()
    if "c" in args:
        df = pd.read_pickle("results/dataframe.pkl")
        sns.clustermap(df.corr())  # Clustered correlation matrix of the features
        plt.savefig("results/clustermap.png")
    if "h" in args:
        print("Usage: pig.py -r {felrpc}\n",
              "f - featurelists with number of occurrences\n",
              "e - estimators\n",
              "l - shows ALL featurelists with the info used to create them\n",
              "r - creates and plots the roc_curve\n",
              "p - creates and plots the precision_recall_curve\n",
              "c - creates and plots the clustered correlation matrix")
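# Hypothetical call of showresults(): the flag string combines any of the
# characters documented under the "h" flag; "er" prints the estimator summaries
# and draws the ROC curve without opening a plot window.
def _example_showresults():
    showresults("er", resultfile="results/results.json", showplots=False)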
def getresults2(numrandomtasks=10000, n_best=10, tmpdirectory="tmp"):
    """Calculates the average F1 score of each random featurelist over every fold.
    Collects these scores into a histogram and dumps it into the results
    directory. Also takes the n_best featurelists with the best average F1
    scores and dumps them separately.

    Warning: This will not work correctly if:
    - numrandomtasks is not the correct value
    - pig.py was executed with feature selection methods other than --random
    - pig.py was executed with more than one classifier

    Example:
    - Executing pig.py --random 40 10000 -n 7 results in 70000 files with 40
      features each. Files 0-9999 are 10000 different featurelists for the
      1st fold, files 10000-19999 are the same 10000 featurelists for the
      2nd fold, and so on.
    - This code takes taskid % numrandomtasks and calculates the average F1
      score of each of these featurelists over every fold.
      => Takes 7 files each and calculates their average F1 score
      => Results in 10000 scores in total.

    Args:
      numrandomtasks (int): Needs to be the same number as the 2nd argument of
        --random in pig.py, i.e. the number of random featurelists per fold.
      n_best (int): Number of best featurelists that should be saved separately.
      tmpdirectory (string): Location of the "tmp" directory.
    """
    score_d = defaultdict(list)
    avg_score_d = defaultdict(list)
    featurelist_d = defaultdict(list)
    i = 0  # Counts NaN precision values (debug counter, not used further)
    j = 0  # Counts NaN F1 scores (debug counter, not used further)
    for rfile in os.listdir(f"{tmpdirectory}/task_results"):
        taskid = int(rfile.split(".")[0])
        f = b.loadfile(f"{tmpdirectory}/task_results/{rfile}")
        scores = f[1]
        fl = f[3]  # Featurelist
        tpr, precision = scores[2][0], scores[2][2]
        if np.isnan(precision):
            i += 1
        # numrandomtasks dictionary entries, each with one (tpr, precision) tuple per fold
        score_d[taskid % numrandomtasks].append((tpr, precision))
        # numrandomtasks different featurelists
        featurelist_d[taskid % numrandomtasks] = fl
    # Calculate the average F1 score of each entry
    f1_list = []  # Used for the histogram
    for key in score_d:
        sum_tpr, sum_precision = 0, 0
        for tpr, precision in score_d[key]:
            sum_tpr += tpr
            sum_precision += precision
        avg_tpr = sum_tpr / len(score_d[key])
        avg_precision = sum_precision / len(score_d[key])
        f1 = 2 * ((avg_precision * avg_tpr) / (avg_precision + avg_tpr))
        if np.isnan(f1):
            j += 1
        f1_list.append(f1)
        avg_score_d[key] = f1
    # Get the best n_best featurelists
    best_featurelists = {}
    for key in dict(sorted(avg_score_d.items(), key=itemgetter(1),
                           reverse=True)[:n_best]).keys():
        best_featurelists[key] = (avg_score_d[key], featurelist_d[key])
    b.dumpfile(best_featurelists, "results/best_featurelists.json")
    # Draw the histogram
    fontsize = 18  # Size of the text for the labels.
    plt.figure(figsize=(12.8, 9.6))
    plt.xlabel("F1-Score", fontsize=fontsize)
    plt.ylabel("Number of Scores", fontsize=fontsize)
    plt.hist(f1_list, bins=100)
    plt.savefig("results/f1_histogram.png")
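# Small worked sketch of the grouping used by getresults2(): with
# numrandomtasks=10000 and 7 folds (the numbers from the docstring example, not
# a requirement), task ids 0, 10000, ..., 60000 all refer to featurelist 0,
# just evaluated on folds 0..6.
def _example_task_grouping(numrandomtasks=10000, n_folds=7):
    from collections import defaultdict
    groups = defaultdict(list)
    for taskid in range(numrandomtasks * n_folds):
        groups[taskid % numrandomtasks].append(taskid // numrandomtasks)
    # Every featurelist index now maps to exactly one entry per fold: [0, ..., n_folds-1]
    assert all(folds == list(range(n_folds)) for folds in groups.values())
    return groups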