def _cutoff_index(cumulative, total, threshold, size):
    """Return the last index of the leading run with cumulative[i] / total >= threshold.

    Indices 0 .. size - 1 are scanned in order and the scan stops at the
    first index whose fraction does not reach *threshold*.

    Returns -1 when index 0 already fails the test or when *size* is not
    positive. *total* must be non-zero; the caller is responsible for
    checking that.
    """
    index = -1
    for i in range(size):
        # float() keeps true division under Python 2 integer semantics.
        if not (cumulative[i] / float(total) >= threshold):
            break
        index = i
    return index


def main():
    """Command-line entry point that calculates per-predictor cutoffs.

    Steps, in order:

    1. Parse the command line (ranges file, scores file, optional output
       path, predictor selection, precision and transform options).
    2. Load the ``min``/``max`` range of every predictor from the ranges
       JSON file; predictors missing from it default to ``(0.0, 1.0)``.
    3. Build one binned statistics record per selected predictor and let
       ``load_events`` fill it from the scores file.
    4. Compute reverse cumulative distributions (``cdp``/``cdn``) from the
       per-bin distributions (``dp``/``dn``).
    5. Derive the ``low_mid`` cutoff (0.95 of the ``dp`` total) and the
       ``mid_high`` cutoff (0.20 of the ``dn`` total).
    6. Save the resulting state with ``save_weights``.

    Any exception raised in steps 2-6 is passed to ``cmd.handle_error()``.

    Returns:
        0. NOTE(review): 0 is also returned after ``cmd.handle_error()``
        when that helper returns normally - confirm this is the intended
        exit status for failures.
    """
    parser = argparse.ArgumentParser(
        description="Calculate TransFIC cutoffs")

    cmd = DefaultCommandHelper(parser)

    parser.add_argument("ranges_path", metavar="RANGES_PATH",
                        help="JSON file generated with pred-list containing predictors stats. Only min and max are used.")

    parser.add_argument("scores_path", metavar="SCORES_PATH",
                        help="The dataset with scores for non recurrent and highly recurrent. ID column should be NON/HIGH for non-rec/highly-rec datasets.")

    parser.add_argument("-o", dest="out_path", metavar="OUT_PATH",
                        help="The file where cutoffs will be saved. Use - for standard output.")

    cmd.add_selected_predictors_args()

    parser.add_argument("-P", "--precision", dest="precision", metavar="PRECISION", type=int, default=3,
                        help="Distribution precision")

    cmd.add_transform_args()

    args, logger = cmd.parse_args("cutoffs")

    if args.out_path is None:
        # Default output: "<scores basename without '-scores'>-cutoffs.json"
        # in the current working directory.
        prefix = os.path.splitext(os.path.basename(args.scores_path))[0]
        if prefix.endswith("-scores"):
            prefix = prefix[:-len("-scores")]
        args.out_path = os.path.join(os.getcwd(), "{}-cutoffs.json".format(prefix))

    # NOTE(review): the bare except also intercepts KeyboardInterrupt and
    # SystemExit; it is kept because cmd.handle_error() is not visible here
    # and may rely on seeing them.
    try:
        logger.info("Loading ranges from {} ...".format(os.path.basename(args.ranges_path)))

        with open(args.ranges_path) as f:
            pred_stats = json.load(f)

        predictor_range = {
            pid: (pstats["min"], pstats["max"])
            for pid, pstats in pred_stats.items()}

        transforms = cmd.get_transforms()

        logger.info("Reading datasets {} ...".format(
            args.scores_path if args.scores_path != "-" else "from standard input"))

        with tsv.open(args.scores_path) as f:

            # Select predictors from the available predictors in the dataset or user selection

            column_names, column_indices = tsv.header(f)
            excluded_columns = set(COORD_COLUMNS) | {"ID"}
            available_predictors = [c for c in column_names if c not in excluded_columns]
            predictors = cmd.get_selected_predictors(available_predictors)

            # Initialize statistics

            # Bin width: precision=3 gives bins of 0.001.
            step = 1.0 / 10**args.precision

            stats = {}

            # Everything that gets saved; only the first element of every
            # transform pair is kept.
            state = {
                "predictors": predictors,
                "stats": stats,
                "transforms": {p: [e for e, _ in t] for p, t in transforms.items()},
                "precision": args.precision,
                "step": step}

            for predictor in predictors:
                rmin, rmax = predictor_range.get(predictor, (0.0, 1.0))
                dim = rmax - rmin
                size = int(dim / step) + 1
                # Left edge of every bin, starting at rmin.
                values = [(x * step) + rmin for x in range(size)]

                stats[predictor] = {
                    "rmin": rmin,
                    "rmax": rmax,
                    "dim": dim,
                    "values": values,
                    "size": size,
                    "vmin": None,
                    "vmax": None,
                    "dp": [0] * size,
                    "dn": [0] * size,
                    "cdp": [0] * size,
                    "cdn": [0] * size,
                    "cump": 0,
                    "cumn": 0,
                    "cutoff": None,
                    "cutoff_index": None}

            # load_events is defined elsewhere; it receives the stats
            # records and presumably fills dp/dn/vmin/vmax - verify there.
            counts = load_events(f, column_indices, predictors, transforms, stats, logger)

            logger.info(" {}".format(", ".join(["{}={}".format(n, c) for n, c in counts.items()])))

        logger.info("Calculating cumulative distribution ...")

        for predictor in predictors:
            predictor_stats = stats[predictor]
            dp, dn, cdp, cdn = [predictor_stats[k] for k in ["dp", "dn", "cdp", "cdn"]]

            # Accumulate from the last bin towards the first, so that
            # cdp[i] == sum(dp[i:]) and cdn[i] == sum(dn[i:]).
            cump = 0
            cumn = 0
            for i in reversed(range(len(dp))):
                cump += dp[i]
                cdp[i] = cump

                cumn += dn[i]
                cdn[i] = cumn

            predictor_stats["cump"] = cump
            predictor_stats["cumn"] = cumn

            logger.info(" {}: cump={}, cumn={}".format(predictor, cump, cumn))

        logger.info("Calculating cutoffs ...")

        for predictor in predictors:
            predictor_stats = stats[predictor]
            values, size, vmin, vmax, cump, cumn, cdp, cdn = [predictor_stats[k] for k in [
                "values", "size", "vmin", "vmax", "cump", "cumn", "cdp", "cdn"]]

            # An empty distribution would otherwise surface as a bare
            # ZeroDivisionError from the ratio computation below.
            if cump == 0 or cumn == 0:
                raise ValueError(
                    "Predictor {}: empty distribution (cump={}, cumn={}), cutoffs can not be calculated".format(
                        predictor, cump, cumn))

            # Last bin for which at least 95% of the dp total lies at or above it.
            cutoff_low_mid_index = _cutoff_index(cdp, cump, 0.95, size)
            cutoff_low_mid = values[cutoff_low_mid_index]

            predictor_stats["cutoff_low_mid"] = cutoff_low_mid
            predictor_stats["cutoff_low_mid_index"] = cutoff_low_mid_index

            # Last bin for which at least 20% of the dn total lies at or above it.
            cutoff_mid_high_index = _cutoff_index(cdn, cumn, 0.20, size)
            cutoff_mid_high = values[cutoff_mid_high_index]

            predictor_stats["cutoff_mid_high"] = cutoff_mid_high
            predictor_stats["cutoff_mid_high_index"] = cutoff_mid_high_index

            logger.info(" {}: cutoffs: vmin={}, low_mid={}, mid_high={}, vmax={}".format(
                predictor, vmin, cutoff_low_mid, cutoff_mid_high, vmax))

        logger.info("Saving state ...")

        save_weights(args.out_path, state)

    except:
        cmd.handle_error()

    return 0
def _draw_cutoff_overlay(ax, vmin, vmax, cutoff_low_mid, cutoff_mid_high, top):
    """Draw cutoff markers, the three shaded regions and the legend on *ax*.

    Must be called after the two distribution curves have been plotted:
    the legend is given two labels only and relies on those curves being
    the first artists on the axes.

    *top* is the upper end of the vertical dashed cutoff lines.
    """
    for cutoff in (cutoff_low_mid, cutoff_mid_high):
        ax.plot([cutoff, cutoff], [0.0, top], "k--")

    ax.axvspan(vmin, cutoff_low_mid, facecolor='g', alpha=0.3)
    ax.axvspan(cutoff_low_mid, cutoff_mid_high, facecolor='y', alpha=0.3)
    ax.axvspan(cutoff_mid_high, vmax, facecolor='r', alpha=0.3)

    # loc is passed by keyword: current matplotlib interprets two positional
    # arguments as (handles, labels), not (labels, loc).
    ax.legend(('HIGH-REC', 'NON-REC'), loc='upper center', ncol=2, frameon=False, prop={'size': 10})


def main():
    """Command-line entry point that plots distributions and cutoffs.

    Loads a state with ``load_weights`` from the given path, and for every
    selected predictor draws two stacked subplots in one figure:

    * top row: the per-bin distributions ``dp`` (red) and ``dn`` (blue);
    * bottom row: the cumulative distributions ``cdp``/``cdn`` divided by
      their totals ``cump``/``cumn``.

    Both subplots show the ``cutoff_low_mid`` and ``cutoff_mid_high``
    values as dashed lines and shade the regions they delimit.

    The figure is saved when ``-o`` is given and shown when ``-i`` is
    given. Returns None.
    """
    parser = argparse.ArgumentParser(
        description="Plot cutoffs")

    cmd = DefaultCommandHelper(parser)

    parser.add_argument("path", metavar="PATH",
                        help="The statistics json file")

    parser.add_argument("-o", dest="out_path", metavar="PATH",
                        help="The path to save the plot image.")

    cmd.add_selected_predictors_args()

    parser.add_argument("-i", "--interactive", dest="interactive", action="store_true", default=False,
                        help="Show the plot in interactive mode.")

    args, logger = cmd.parse_args("plot-cutoffs")

    logger.info("Loading state ...")

    state = load_weights(args.path)

    available_predictors, stats = [state[k] for k in ["predictors", "stats"]]

    predictors = cmd.get_selected_predictors(available_predictors)

    logger.info("Plotting ...")

    fig = plt.figure(figsize=(12.4, 10.5), dpi=100)
    fig.subplots_adjust(left=0.06, bottom=0.03, right=0.99, top=0.96, wspace=0.22, hspace=0.15)

    num_predictors = len(predictors)

    for i, predictor in enumerate(predictors):
        predictor_stats = stats[predictor]
        (intervals, vmin, vmax, dp, dn, cump, cumn, cdp, cdn,
            cutoff_low_mid, cutoff_mid_high) = [predictor_stats[k] for k in [
                "values", "vmin", "vmax", "dp", "dn", "cump", "cumn", "cdp", "cdn",
                "cutoff_low_mid", "cutoff_mid_high"]]

        # Grid of 2 rows x num_predictors columns: distribution on the
        # first row, cumulative distribution right below it.
        dax = fig.add_subplot(2, num_predictors, i + 1, title="{}".format(predictor))
        cdax = fig.add_subplot(2, num_predictors, 1 * num_predictors + i + 1)

        dax.grid()
        dax.set_xlim(vmin, vmax)
        dax.plot(intervals, dp, "r-", alpha=0.5)
        dax.plot(intervals, dn, "b-", alpha=0.5)
        # Height of the cutoff markers: the largest value found in either
        # distribution (assumes dp and dn are plain lists, so + concatenates).
        peak = max(dp + dn)
        _draw_cutoff_overlay(dax, vmin, vmax, cutoff_low_mid, cutoff_mid_high, peak)

        cdax.grid()
        cdax.set_xlim(vmin, vmax)
        # Normalise by the totals so both curves start at 1.0.
        cdax.plot(intervals, [v / float(cump) for v in cdp], "r-")
        cdax.plot(intervals, [v / float(cumn) for v in cdn], "b-")
        _draw_cutoff_overlay(cdax, vmin, vmax, cutoff_low_mid, cutoff_mid_high, 1.0)

    if args.out_path is not None:
        # Save the figure created above explicitly instead of relying on
        # the implicit "current figure".
        fig.savefig(args.out_path, bbox_inches=0)

    if args.interactive:
        plt.show()