def main():
    parser = argparse.ArgumentParser(
        description="Plot training sets statistics")

    parser.add_argument("path", metavar="PATH",
                        help="The statistics json file")

    parser.add_argument("-o", dest="out_path", metavar="PATH",
                        help="The path to save the plot image.")

    parser.add_argument("-W", "--width", dest="fig_width", metavar="WIDTH", type=int,
                        help="The image width.")

    parser.add_argument("-H", "--height", dest="fig_height", metavar="HEIGHT", type=int,
                        help="The image height.")

    parser.add_argument("--dpi", dest="fig_dpi", metavar="DPI", type=int, default=100,
                        help="The image dpi.")

    parser.add_argument("-p", "--predictors", dest="predictor_names", metavar="NAMES",
                        help="The names of the predictors to represent (separated by commas).")

    parser.add_argument("-i", "--interactive", dest="interactive", action="store_true", default=False,
                        help="Show the plot in interactive mode.")

    bglogging.add_logging_arguments(parser)

    args = parser.parse_args()

    bglogging.initialize(args)

    log = bglogging.get_logger("plot-stats")

    log.info("Loading state from {} ...".format(os.path.basename(args.path)))

    state = load_weights(args.path)

    predictor_names, stats = [state[k] for k in ["predictor_names", "stats"]]

    if args.predictor_names is not None:
        valid_names = set(predictor_names)
        args.predictor_names = [s.strip() for s in args.predictor_names.split(",")]
        predictor_names = [name for name in args.predictor_names if name in valid_names]
        if len(predictor_names) == 0:
            log.error("No scores selected. Please choose between: {}".format(", ".join(valid_names)))
            exit(-1)

    #log.info("Plotting ...")

    fig = plt.figure(figsize=(args.fig_width or 12, args.fig_height or 10.4), dpi=args.fig_dpi or 100)

    alpha = 0.7

    num_predictors = len(predictor_names)
    for i in range(num_predictors):
        predictor_name = predictor_names[i]
        predictor_stats = stats[predictor_name]

        (intervals, dp, dn, cump, cumn, cdp, cdn,
         tp, tn, fp, fn, mcc, accuracy, cutoff) = [predictor_stats[k] for k in [
            "values", "dp", "dn", "cump", "cumn", "cdp", "cdn",
            "tp", "tn", "fp", "fn", "mcc", "acc", "cutoff"]]

        dax = fig.add_subplot(4, num_predictors, i + 1, title="{}".format(predictor_name))
        cdax = fig.add_subplot(4, num_predictors, 1 * num_predictors + i + 1)
        tfax = fig.add_subplot(4, num_predictors, 2 * num_predictors + i + 1)
        aax = fig.add_subplot(4, num_predictors, 3 * num_predictors + i + 1)

        # distribution
        dax.grid()
        dax.plot(intervals, arrdiv(dp, max(dp)), "r-", alpha=alpha)
        dax.plot(intervals, arrdiv(dn, max(dn)), "b-", alpha=alpha)
        dax.plot([cutoff, cutoff], [0.0, 1.0], "k--")
        dax.legend(('POS', 'NEG'), loc='upper center', ncol=2, frameon=False, prop={'size': 10})

        # cumulative distribution
        cdax.grid()
        cdax.plot(intervals, arrdiv(cdp, cump), "r-", alpha=alpha)
        cdax.plot(intervals, arrdiv(cdn, cumn), "b-", alpha=alpha)
        cdax.plot([cutoff, cutoff], [0.0, 1.0], "k--")
        cdax.legend(('POS', 'NEG'), loc='upper center', ncol=2, frameon=False, prop={'size': 10})

        # TP/FN/FP/TN
        tfax.grid()
        tfax.plot(intervals, arrdiv(tp, cump), "r-", alpha=alpha)
        tfax.plot(intervals, arrdiv(fn, cump), "c--", alpha=alpha)
        tfax.plot(intervals, arrdiv(fp, cumn), "b-", alpha=alpha)
        tfax.plot(intervals, arrdiv(tn, cumn), "m--", alpha=alpha)
        tfax.plot([cutoff, cutoff], [0.0, 1.0], "k--")
        tfax.legend(('TP', 'FN', 'FP', 'TN'), loc='upper center', ncol=4, frameon=False, prop={'size': 8})

        # MCC/Accuracy
        aax.grid()
        aax.plot(intervals, mcc, "g-", alpha=alpha)
        aax.plot(intervals, accuracy, "y-", alpha=alpha)
        aax.plot([cutoff, cutoff], [0.0, 1.0], "k--")
        aax.legend(('MCC', 'Accuracy'), loc='upper center', ncol=2, frameon=False, prop={'size': 10})

    if args.out_path is not None:
        log.info("Saving image into {} ...".format(os.path.basename(args.out_path)))
        plt.savefig(args.out_path)

    if args.interactive:
        plt.show()
def main():
    parser = argparse.ArgumentParser(
        description="Calculate TransFIC labels")

    cmd = Command.withtraits(DbTrait, PredictorsInDbTrait)(parser)

    cmd.add_db_args()

    parser.add_argument("cutoffs_path", metavar="CUTOFFS",
                        help="File containing the cutoffs")

    cmd.add_selected_predictors_args()

    parser.add_argument("-u", "--updated-predictors", dest="updated_predictors", metavar="NAMES",
                        help="Comma separated list of updated predictor names")

    args, logger = cmd.parse_args("calc-label")

    db = cmd.open_db()

    try:
        logger.info("Loading state ...")

        state = load_weights(args.cutoffs_path)

        avail_predictors, stats = [state[k] for k in ["predictors", "stats"]]

        predictors = cmd.get_selected_predictors(default_all=True)

        missing_predictors = [p for p in predictors if p not in set(avail_predictors)]
        if len(missing_predictors) > 0:
            raise Exception("Missing cutoff stats for predictors: {}".format(", ".join(missing_predictors)))

        if args.updated_predictors is not None:
            updated_names = [s.strip() for s in args.updated_predictors.split(",")]
            if len(predictors) != len(updated_names):
                raise Exception("The number of selected predictors does not match the number of predictor names to update")
            updated_predictors = dict(zip(predictors, updated_names))
        else:
            updated_predictors = dict([(p, "{}_LABEL".format(p)) for p in predictors])

        # create predictors in the database if required
        db_predictors = set([p["id"] for p in db.predictors()])
        for predictor, updated_predictor in updated_predictors.items():
            if updated_predictor not in db_predictors:
                logger.info("Creating predictor {} ...".format(updated_predictor))
                db.add_predictor(updated_predictor, FannsDb.CALCULATED_PREDICTOR_TYPE, source=[predictor])

        # load the low/mid and mid/high cutoffs for each selected predictor
        cutoffs = {}
        for predictor in predictors:
            cutoff_low_mid, cutoff_mid_high = [stats[predictor][v] for v in ["cutoff_low_mid", "cutoff_mid_high"]]
            logger.info("{}: cutoffs: low_mid={}, mid_high={}".format(predictor, cutoff_low_mid, cutoff_mid_high))
            cutoffs[predictor] = (cutoff_low_mid, cutoff_mid_high)

        logger.info("Calculating ...")

        progress = RatedProgress(logger, name="SNVs")

        for num_rows, row in enumerate(db.query_scores(predictors=predictors), start=1):
            scores = row["scores"]

            # map each score into a label: 0.0 (low), 1.0 (mid) or 2.0 (high)
            uscores = {}
            for predictor in predictors:
                score = scores[predictor]
                if score is None:
                    continue

                cutoff_low_mid, cutoff_mid_high = cutoffs[predictor]
                updated_predictor = updated_predictors[predictor]
                uscores[updated_predictor] = 0.0 if score < cutoff_low_mid else 1.0 if score < cutoff_mid_high else 2.0

            if len(uscores) > 0:
                db.update_scores(row["id"], uscores)

            progress.update()

        db.commit()
    except:
        cmd.handle_error()

    return 0
def main():
    parser = argparse.ArgumentParser(
        description="Calculate Condel label")

    parser.add_argument("db_path", metavar="DB_PATH",
                        help="Functional scores database")

    parser.add_argument("weights_path", metavar="WEIGHTS",
                        help="File containing the scores weights and cutoffs")

    parser.add_argument("-p", "--predictors", dest="predictors", metavar="PREDICTORS",
                        help="Comma separated list of predictors")

    parser.add_argument("-u", "--updated-predictors", dest="updated_predictors", metavar="NAMES",
                        help="Comma separated list of updated predictor names")

    bglogging.add_logging_arguments(parser)

    args = parser.parse_args()

    bglogging.initialize(args)

    log = bglogging.get_logger("calculate-label")

    log.info("Opening functional scores database ...")

    db = FannsSQLiteDb(args.db_path)
    db.open()

    log.info("Loading state ...")

    state = load_weights(args.weights_path)

    avail_predictors, precision, step, stats = [state[k] for k in ["predictor_names", "precision", "step", "stats"]]

    if args.predictors is not None:
        predictors = [p for p in [p.strip() for p in args.predictors.split(",")] if p in avail_predictors]
        if len(predictors) == 0:
            log.error("Unknown predictors: {}".format(args.predictors))
            log.error("Available predictor names are: {}".format(", ".join(avail_predictors)))
            exit(-1)
    else:
        predictors = avail_predictors

    if args.updated_predictors is not None:
        updated_predictors = [p.strip() for p in args.updated_predictors.split(",")]
        if len(predictors) != len(updated_predictors):
            log.error("The number of updated predictor names does not match the number of selected predictors")
            exit(-1)
    else:
        updated_predictors = ["{}_CLASS".format(p.upper()) for p in predictors]

    log.info("Available predictors: {}".format(", ".join(avail_predictors)))
    log.info("Selected predictors: {}".format(", ".join(predictors)))

    for predictor, updated_predictor in zip(predictors, updated_predictors):
        log.info("Creating predictor {} ...".format(updated_predictor))
        db.add_predictor(updated_predictor, FannsDb.CALCULATED_PREDICTOR_TYPE, source=[predictor])

    # load the cutoff (and its MCC/accuracy) for each selected predictor
    cutoffs = []
    for predictor in predictors:
        cutoff, mcc, acc = [stats[predictor][v] for v in ["cutoff", "cutoff_mcc", "cutoff_acc"]]
        log.info("{}: cutoff={}, MCC={}, accuracy={}".format(predictor, cutoff, mcc, acc))
        cutoffs += [cutoff]

    log.info("Calculating ...")

    start_time = partial_start_time = time.time()

    try:
        for num_rows, row in enumerate(db.query_scores(predictors=predictors), start=1):
            scores = row["scores"]

            # label each score as 0.0 (below cutoff) or 1.0 (above cutoff)
            d = {}
            for i, predictor in enumerate(predictors):
                score = scores[predictor]
                if score is None:
                    continue

                cutoff = cutoffs[i]
                updated_predictor = updated_predictors[i]
                d[updated_predictor] = 0.0 if score < cutoff else 1.0

            db.update_scores(row["id"], d)

            partial_time = time.time() - partial_start_time
            if partial_time > 5.0:
                partial_start_time = time.time()
                elapsed_time = time.time() - start_time
                log.debug("  {} rows, {:.1f} rows/second".format(hsize(num_rows), num_rows / elapsed_time))

        db.commit()
    except KeyboardInterrupt:
        log.warn("Interrupted by Ctrl-C")
        db.rollback()
    except:
        db.rollback()
        raise
    finally:
        db.close()
def main():
    parser = argparse.ArgumentParser(
        description="Plot training sets statistics")

    parser.add_argument("path", metavar="PATH",
                        help="The statistics json file")

    parser.add_argument("-o", dest="out_path", metavar="PATH",
                        help="The path to save the plot image.")

    parser.add_argument("-p", "--predictors", dest="predictor_names", metavar="NAMES",
                        help="The names of the predictors to represent (separated by commas).")

    parser.add_argument("-i", "--interactive", dest="interactive", action="store_true", default=False,
                        help="Show the plot in interactive mode.")

    bglogging.add_logging_arguments(parser)

    args = parser.parse_args()

    bglogging.initialize(args)

    log = bglogging.get_logger("plot-stats")

    log.info("Loading state ...")

    state = load_weights(args.path)

    predictor_names, stats = [state[k] for k in ["predictor_names", "stats"]]

    if args.predictor_names is not None:
        valid_names = set(predictor_names)
        args.predictor_names = [s.strip() for s in args.predictor_names.split(",")]
        predictor_names = [name for name in args.predictor_names if name in valid_names]
        if len(predictor_names) == 0:
            log.error("No scores selected. Please choose between: {}".format(", ".join(valid_names)))
            exit(-1)

    log.info("Plotting ...")

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.grid()
    ax.set_xlabel("False Positive Rate (1 - specificity)")
    ax.set_ylabel("True Positive Rate (sensitivity)")

    # plot one ROC curve per predictor
    for predictor_name in predictor_names:
        predictor_stats = stats[predictor_name]

        (size, tp, tn, fp, fn) = [predictor_stats[k] for k in ["size", "tp", "tn", "fp", "fn"]]

        tpr = [1.0] * (size + 1)
        fpr = [1.0] * (size + 1)
        for i in range(size):
            tpr[i + 1] = float(tp[i]) / (tp[i] + fn[i])
            fpr[i + 1] = float(fp[i]) / (fp[i] + tn[i])

        ax.plot(fpr, tpr, "-")

    ax.legend(tuple(predictor_names), loc="lower right", shadow=False)
    ax.plot([0.0, 1.0], [0.0, 1.0], "--", color="0.75")

    if args.out_path is not None:
        plt.savefig(args.out_path)

    if args.interactive:
        plt.show()
def main():
    parser = argparse.ArgumentParser(
        description="Calculate Condel score")

    parser.add_argument("db_path", metavar="DB_PATH",
                        help="Functional scores database")

    parser.add_argument("weights_path", metavar="WEIGHTS",
                        help="File containing the scores weights and cutoffs")

    parser.add_argument("-p", "--predictors", dest="predictors", metavar="PREDICTORS",
                        help="Comma separated list of predictors")

    parser.add_argument("-u", "--updated-predictor", dest="updated_predictor", metavar="NAME",
                        help="Updated predictor name")

    bglogging.add_logging_arguments(parser)

    args = parser.parse_args()

    bglogging.initialize(args)

    log = bglogging.get_logger("calculate")

    log.info("Opening functional scores database ...")

    db = FannsSQLiteDb(args.db_path)
    db.open()

    updated_predictor = args.updated_predictor or "CONDEL"

    # create the calculated predictor in the database if required
    predictors = set([p["id"] for p in db.predictors()])
    if updated_predictor not in predictors:
        log.info("  Creating predictor {} ...".format(updated_predictor))
        db.add_predictor(updated_predictor, FannsDb.CALCULATED_PREDICTOR_TYPE, source=predictors)

    log.info("Loading state ...")

    state = load_weights(args.weights_path)

    avail_predictors, precision, step, stats = [state[k] for k in ["predictor_names", "precision", "step", "stats"]]

    if args.predictors is not None:
        predictors = [p for p in [p.strip() for p in args.predictors.split(",")] if p in avail_predictors]
        if len(predictors) == 0:
            log.error("Unknown predictors: {}".format(args.predictors))
            log.error("Available predictor names are: {}".format(", ".join(avail_predictors)))
            exit(-1)
    else:
        predictors = avail_predictors

    log.info("Available predictors: {}".format(", ".join(avail_predictors)))
    log.info("Selected predictors: {}".format(", ".join(predictors)))

    log.info("Calculating ...")

    start_time = partial_start_time = time.time()

    try:
        for num_rows, row in enumerate(db.query_scores(predictors=predictors), start=1):
            scores = row["scores"]

            condel = wsum = 0
            for predictor, score in scores.items():
                if score is None:
                    continue

                predictor_stats = stats[predictor]

                rmin, rmax, dim, size, cdp, cdn, cutoff = [predictor_stats[k] for k in [
                    "rmin", "rmax", "dim", "size", "cdp", "cdn", "cutoff"]]

                if predictor in PREDICTOR_TRANSFORM:
                    score = PREDICTOR_TRANSFORM[predictor](score)

                # locate the score bin within the cumulative distributions
                r = (score - rmin) / dim
                index = int(r * size) if score < rmax else size - 1

                # weight the score by the complementary cumulative distribution
                if score < cutoff:
                    w = 1 - cdn[index]
                else:
                    w = 1 - cdp[index]

                wsum += w
                condel += w * score
                #log.info("{}={}, w={} -> {}".format(predictor, score, w, score * w))

            if wsum != 0:
                condel /= wsum
                d = {updated_predictor: condel}
                db.update_scores(row["id"], d)
                #log.info(">>> CONDEL={}".format(condel))
            else:
                log.warn("wsum = 0, condel={}, scores={}".format(condel, repr(scores)))

            partial_time = time.time() - partial_start_time
            if partial_time > 5.0:
                partial_start_time = time.time()
                elapsed_time = time.time() - start_time
                log.debug("  {} rows, {:.1f} rows/second".format(hsize(num_rows), num_rows / elapsed_time))

        log.info("Commit ...")
        db.commit()
    except KeyboardInterrupt:
        log.warn("Interrupted by Ctrl-C")
        db.rollback()
    except:
        db.rollback()
        raise
    finally:
        db.close()
def main():
    parser = argparse.ArgumentParser(
        description="Plot cutoffs")

    cmd = DefaultCommandHelper(parser)

    parser.add_argument("path", metavar="PATH",
                        help="The statistics json file")

    parser.add_argument("-o", dest="out_path", metavar="PATH",
                        help="The path to save the plot image.")

    cmd.add_selected_predictors_args()

    parser.add_argument("-i", "--interactive", dest="interactive", action="store_true", default=False,
                        help="Show the plot in interactive mode.")

    args, logger = cmd.parse_args("plot-cutoffs")

    logger.info("Loading state ...")

    state = load_weights(args.path)

    available_predictors, stats = [state[k] for k in ["predictors", "stats"]]

    predictors = cmd.get_selected_predictors(available_predictors)

    logger.info("Plotting ...")

    fig = plt.figure(figsize=(12.4, 10.5), dpi=100)
    fig.subplots_adjust(left=0.06, bottom=0.03, right=0.99, top=0.96, wspace=0.22, hspace=0.15)

    num_predictors = len(predictors)
    for i, predictor in enumerate(predictors):
        predictor_stats = stats[predictor]

        (intervals, vmin, vmax, dp, dn, cump, cumn, cdp, cdn,
         cutoff_low_mid, cutoff_mid_high) = [predictor_stats[k] for k in [
            "values", "vmin", "vmax", "dp", "dn", "cump", "cumn", "cdp", "cdn",
            "cutoff_low_mid", "cutoff_mid_high"]]

        dax = fig.add_subplot(2, num_predictors, i + 1, title="{}".format(predictor))
        cdax = fig.add_subplot(2, num_predictors, 1 * num_predictors + i + 1)

        # distributions with the cutoff lines and shaded regions
        dax.grid()
        dax.set_xlim(vmin, vmax)
        dax.plot(intervals, dp, "r-", alpha=0.5)
        dax.plot(intervals, dn, "b-", alpha=0.5)
        dax.plot([cutoff_low_mid, cutoff_low_mid], [0.0, max(dp + dn)], "k--")
        dax.plot([cutoff_mid_high, cutoff_mid_high], [0.0, max(dp + dn)], "k--")
        dax.axvspan(vmin, cutoff_low_mid, facecolor='g', alpha=0.3)
        dax.axvspan(cutoff_low_mid, cutoff_mid_high, facecolor='y', alpha=0.3)
        dax.axvspan(cutoff_mid_high, vmax, facecolor='r', alpha=0.3)
        dax.legend(('HIGH-REC', 'NON-REC'), loc='upper center', ncol=2, frameon=False, prop={'size': 10})

        # cumulative distributions with the cutoff lines and shaded regions
        cdax.grid()
        cdax.set_xlim(vmin, vmax)
        cdax.plot(intervals, [v / float(cump) for v in cdp], "r-")
        cdax.plot(intervals, [v / float(cumn) for v in cdn], "b-")
        cdax.plot([cutoff_low_mid, cutoff_low_mid], [0.0, 1.0], "k--")
        cdax.plot([cutoff_mid_high, cutoff_mid_high], [0.0, 1.0], "k--")
        cdax.axvspan(vmin, cutoff_low_mid, facecolor='g', alpha=0.3)
        cdax.axvspan(cutoff_low_mid, cutoff_mid_high, facecolor='y', alpha=0.3)
        cdax.axvspan(cutoff_mid_high, vmax, facecolor='r', alpha=0.3)
        cdax.legend(('HIGH-REC', 'NON-REC'), loc='upper center', ncol=2, frameon=False, prop={'size': 10})

    if args.out_path is not None:
        plt.savefig(args.out_path)

    if args.interactive:
        plt.show()