def main():
    """Entry point for the 'cutoffs' command.

    Reads predictor min/max ranges from a JSON file and a scores dataset,
    accumulates per-predictor score distributions for highly-recurrent vs
    non-recurrent variants (via load_events), derives the low/mid and
    mid/high cutoffs from the cumulative distributions, and saves the
    resulting state with save_weights.

    NOTE(review): uses `xrange`, so this file appears to target Python 2.
    """
    parser = argparse.ArgumentParser(
        description="Calculate TransFIC cutoffs")

    cmd = DefaultCommandHelper(parser)

    parser.add_argument("ranges_path", metavar="RANGES_PATH",
                        help="JSON file generated with pred-list containing predictors stats. Only min and max are used.")

    parser.add_argument("scores_path", metavar="SCORES_PATH",
                        help="The dataset with scores for non recurrent and highly recurrent. ID column should be NON/HIGH for non-rec/highly-rec datasets.")

    parser.add_argument("-o", dest="out_path", metavar="OUT_PATH",
                        help="The file where cutoffs will be saved. Use - for standard output.")

    cmd.add_selected_predictors_args()

    parser.add_argument("-P", "--precision", dest="precision", metavar="PRECISION", type=int, default=3,
                        help="Distribution precision")

    cmd.add_transform_args()

    args, logger = cmd.parse_args("cutoffs")

    # Default output path: "<scores basename without '-scores'>-cutoffs.json"
    # in the current working directory.
    if args.out_path is None:
        prefix = os.path.splitext(os.path.basename(args.scores_path))[0]
        if prefix.endswith("-scores"):
            prefix = prefix[:-7]
        args.out_path = os.path.join(os.getcwd(), "{}-cutoffs.json".format(prefix))

    try:
        logger.info("Loading ranges from {} ...".format(os.path.basename(args.ranges_path)))

        # predictor_range maps predictor id -> (min, max) taken from the stats JSON.
        with open(args.ranges_path) as f:
            pred_stats = json.load(f)
            predictor_range = {}
            for pid, pstats in pred_stats.items():
                predictor_range[pid] = (pstats["min"], pstats["max"])

        transforms = cmd.get_transforms()

        logger.info("Reading datasets {} ...".format(args.scores_path if args.scores_path != "-" else "from standard input"))

        with tsv.open(args.scores_path) as f:
            # Select predictors from the available predictors in the dataset or user selection
            column_names, column_indices = tsv.header(f)
            excluded_columns = set(COORD_COLUMNS) | set(["ID"])
            available_predictors = [c for c in column_names if c not in excluded_columns]
            predictors = cmd.get_selected_predictors(available_predictors)

            # Initialize statistics.
            # step is the histogram bin width implied by the requested precision.
            step = 1.0 / 10**args.precision
            stats = dict()
            # `state` is what gets persisted at the end; it holds `stats` by
            # reference, so everything accumulated below ends up saved too.
            state = dict(
                predictors = predictors,
                stats = stats,
                transforms=dict([(p, [e for e, _ in t])
                                 for p, t in transforms.items()]),
                precision = args.precision,
                step = step)

            for predictor in predictors:
                # Predictors without a known range default to [0.0, 1.0].
                rmin, rmax = predictor_range[predictor] if predictor in predictor_range else (0.0, 1.0)
                dim = rmax - rmin
                size = int(dim / step) + 1
                # Bin center values for the histogram, from rmin upwards.
                values = [(x * step) + rmin for x in xrange(size)]
                stats[predictor] = dict(
                    rmin = rmin, rmax = rmax,          # declared score range
                    dim = dim,                          # range width
                    values = values,                    # bin values
                    size = size,                        # number of bins
                    vmin = None, vmax = None,           # observed min/max — presumably filled by load_events; TODO confirm
                    dp = [0] * size, dn = [0] * size,   # per-bin counts: positive (HIGH) / negative (NON)
                    cdp = [0] * size, cdn = [0] * size, # cumulative counts (filled below)
                    cump = 0, cumn = 0,                 # totals (filled below)
                    cutoff = None, cutoff_index = None)

            # load_events consumes the remaining rows of `f` and fills dp/dn.
            counts = load_events(f, column_indices, predictors, transforms, stats, logger)

        logger.info(" {}".format(", ".join(["{}={}".format(n, c) for n, c in counts.items()])))

        logger.info("Calculating cumulative distribution ...")

        # Build right-to-left cumulative distributions: cdp[i]/cdn[i] hold the
        # number of events with value >= values[i].
        for predictor in predictors:
            predictor_stats = stats[predictor]
            dp, dn, cdp, cdn = [predictor_stats[k] for k in ["dp", "dn", "cdp", "cdn"]]
            cump = 0
            cumn = 0
            i = len(dp) - 1
            while i >= 0:
                #cdp[i] = dp[i] + cump
                cump += dp[i]
                cdp[i] = cump
                # Equivalent formulation for the negative counts: add the bin
                # first, then advance the running total.
                cdn[i] = dn[i] + cumn
                cumn += dn[i]
                i -= 1
            predictor_stats["cump"] = cump
            predictor_stats["cumn"] = cumn
            logger.info(" {}: cump={}, cumn={}".format(predictor, cump, cumn))

        logger.info("Calculating cutoffs ...")

        for predictor in predictors:
            predictor_stats = stats[predictor]
            values, size, vmin, vmax, cump, cumn, cdp, cdn = [predictor_stats[k] for k in [
                "values", "size", "vmin", "vmax", "cump", "cumn", "cdp", "cdn"]]

            # low/mid cutoff: last bin that still retains >= 95% of the
            # positive (HIGH-REC) cumulative mass.
            # NOTE(review): if the loop never matches, index stays -1 and
            # values[-1] (the last bin) is used — confirm this is intended.
            cutoff_low_mid_index = -1
            i = 0
            while (i < size) and (cdp[i] / float(cump) >= 0.95):
                cutoff_low_mid_index = i
                i += 1
            cutoff_low_mid = values[cutoff_low_mid_index]
            predictor_stats["cutoff_low_mid"] = cutoff_low_mid
            predictor_stats["cutoff_low_mid_index"] = cutoff_low_mid_index

            # mid/high cutoff: last bin that still retains >= 20% of the
            # negative (NON-REC) cumulative mass.
            cutoff_mid_high_index = -1
            i = 0
            while (i < size) and (cdn[i] / float(cumn) >= 0.20):
                cutoff_mid_high_index = i
                i += 1
            cutoff_mid_high = values[cutoff_mid_high_index]
            predictor_stats["cutoff_mid_high"] = cutoff_mid_high
            predictor_stats["cutoff_mid_high_index"] = cutoff_mid_high_index

            logger.info(" {}: cutoffs: vmin={}, low_mid={}, mid_high={}, vmax={}".format(predictor, vmin, cutoff_low_mid, cutoff_mid_high, vmax))

        logger.info("Saving state ...")

        out_path = args.out_path
        save_weights(out_path, state)
    except:
        # Project-wide convention: delegate any failure to the command helper.
        cmd.handle_error()

    return 0
def main():
    """Entry point for the 'blt-partial' command.

    Reads a tab-separated scores file (chrom, pos, ref, alt, feature,
    score...), applies the configured per-predictor transforms, and
    accumulates per-feature partial statistics — (count, sum, sum of
    squares) per predictor — then writes them as "c/s/ss" cells in a TSV.

    Returns 0 on success; exits with -1 on fatal input errors.
    """
    parser = argparse.ArgumentParser(
        description="Calculate Baseline Tolerance partial statistics per feature")

    cmd = DefaultCommandHelper(parser)

    parser.add_argument("scores_path", metavar="SCORES_PATH", help="The scores file")
    parser.add_argument("predictors", metavar="PREDICTORS", help="Comma separated list of predictors")
    parser.add_argument("out_path", metavar="OUTPUT_PATH", help="Output file.")

    cmd.add_transform_args()

    args, logger = cmd.parse_args("blt-partial")

    predictors = [p.strip() for p in args.predictors.split(",") if len(p.strip()) > 0]
    num_predictors = len(predictors)
    if num_predictors == 0:
        logger.error("At least one predictor is needed")
        exit(-1)

    logger.info("Selected predictors: {}".format(", ".join(predictors)))

    transforms = cmd.get_transforms()

    # stats: feature -> tuple of [count, sum, sum_of_squares] per predictor.
    stats = {}
    lost_snvs = 0
    # FIX: count lines explicitly. The original logged `line_num` (the last
    # 0-based enumerate index), which under-reported the SNV count by one and
    # raised NameError when the input file was empty.
    num_snvs = 0

    scores_path = args.scores_path
    logger.info("Reading scores from {} ...".format(
        os.path.basename(scores_path) if scores_path != "-" else "standard input"))

    with tsv.open(scores_path) as sf:
        for line_num, line in enumerate(sf):
            num_snvs += 1
            fields = line.rstrip("\n").split("\t")
            chrom, pos, ref, alt, feature = fields[:5]
            if len(feature) == 0:
                # Variants without a feature annotation are counted as lost.
                lost_snvs += 1
                continue

            scores = fields[5:]
            if len(scores) != num_predictors:
                # NOTE(review): line_error presumably aborts the program;
                # otherwise execution falls through with a mismatched row — confirm.
                line_error(logger, scores_path, line_num, "Number of score columns does not match the number of predictors")

            try:
                # Empty cells mean "no score" for that predictor.
                scores = [float(v) if len(v) > 0 else None for v in scores]
            except ValueError:
                # FIX: narrowed from a bare `except:` — float() only raises
                # ValueError for malformed text; a bare except also swallowed
                # KeyboardInterrupt/SystemExit.
                line_error(logger, scores_path, line_num, "Scores should be real numbers: {}".format(scores))

            if feature not in stats:
                stats[feature] = tuple([[0, 0.0, 0.0] for p in predictors])

            feature_stats = stats[feature]
            for i, score in enumerate(scores):
                if score is not None:
                    predictor = predictors[i]
                    if predictor in transforms:
                        # Apply the transform chain in order; abort on failure.
                        for name, func in transforms[predictor]:
                            try:
                                score = func(score)
                            except Exception:
                                # FIX: narrowed from a bare `except:`.
                                logger.error("Error transforming the {} score {} with {}".format(predictor, score, name))
                                exit(-1)
                    feature_stats[i][0] += 1
                    feature_stats[i][1] += score
                    feature_stats[i][2] += score * score

    logger.info("Saving results into {} ...".format(
        os.path.basename(args.out_path) if args.out_path != "-" else "standard output"))

    with tsv.open(args.out_path, "w") as of:
        tsv.write_line(of, "FEATURE", *predictors)
        for feature in sorted(stats.keys()):
            sb = [feature]
            feature_stats = stats[feature]
            for i in range(num_predictors):
                # Serialize each predictor's stats as "count/sum/sum_sq".
                sb += ["/".join([repr(v) for v in feature_stats[i]])]
            tsv.write_line(of, *sb)

    logger.info("Number of SNV's = {}, lost SNV's = {}, number of features = {}".format(num_snvs, lost_snvs, len(stats)))

    return 0
def main():
    """Entry point for the 'plot-cutoffs' command.

    Loads a previously saved cutoffs state (load_weights) and, for each
    selected predictor, draws two stacked panels: the raw HIGH-REC/NON-REC
    score distributions on top and the normalized cumulative distributions
    below, with the low/mid and mid/high cutoffs marked. The figure can be
    saved to a file (-o) and/or shown interactively (-i).
    """
    parser = argparse.ArgumentParser(
        description="Plot cutoffs")

    cmd = DefaultCommandHelper(parser)

    parser.add_argument("path", metavar="PATH", help="The statistics json file")
    parser.add_argument("-o", dest="out_path", metavar="PATH", help="The path to save the plot image.")
    cmd.add_selected_predictors_args()
    parser.add_argument("-i", "--interactive", dest="interactive", action="store_true", default=False,
                        help="Show the plot in interactive mode.")

    args, logger = cmd.parse_args("plot-cutoffs")

    logger.info("Loading state ...")

    state = load_weights(args.path)
    available_predictors = state["predictors"]
    stats = state["stats"]
    predictors = cmd.get_selected_predictors(available_predictors)

    logger.info("Plotting ...")

    fig = plt.figure(figsize=(12.4, 10.5), dpi=100)
    fig.subplots_adjust(left=0.06, bottom=0.03, right=0.99, top=0.96, wspace=0.22, hspace=0.15)

    num_predictors = len(predictors)
    for col, pred_name in enumerate(predictors):
        pstats = stats[pred_name]
        intervals = pstats["values"]
        vmin, vmax = pstats["vmin"], pstats["vmax"]
        dp, dn = pstats["dp"], pstats["dn"]
        cump, cumn = pstats["cump"], pstats["cumn"]
        cdp, cdn = pstats["cdp"], pstats["cdn"]
        cutoff_low_mid = pstats["cutoff_low_mid"]
        cutoff_mid_high = pstats["cutoff_mid_high"]

        # One column per predictor: distributions on row 1, cumulative on row 2.
        dax = fig.add_subplot(2, num_predictors, col + 1, title="{}".format(pred_name))
        cdax = fig.add_subplot(2, num_predictors, 1 * num_predictors + col + 1)

        # --- raw distributions panel ---
        dax.grid()
        dax.set_xlim(vmin, vmax)
        dax.plot(intervals, dp, "r-", alpha=0.5)
        dax.plot(intervals, dn, "b-", alpha=0.5)
        peak = max(dp + dn)
        dax.plot([cutoff_low_mid, cutoff_low_mid], [0.0, peak], "k--")
        dax.plot([cutoff_mid_high, cutoff_mid_high], [0.0, peak], "k--")
        # Shade the three regions implied by the cutoffs.
        dax.axvspan(vmin, cutoff_low_mid, facecolor='g', alpha=0.3)
        dax.axvspan(cutoff_low_mid, cutoff_mid_high, facecolor='y', alpha=0.3)
        dax.axvspan(cutoff_mid_high, vmax, facecolor='r', alpha=0.3)
        dax.legend(('HIGH-REC', 'NON-REC'), 'upper center', ncol=2, frameon=False, prop={'size':10})

        # --- normalized cumulative distributions panel ---
        cdax.grid()
        cdax.set_xlim(vmin, vmax)
        cdax.plot(intervals, [v / float(cump) for v in cdp], "r-")
        cdax.plot(intervals, [v / float(cumn) for v in cdn], "b-")
        cdax.plot([cutoff_low_mid, cutoff_low_mid], [0.0, 1.0], "k--")
        cdax.plot([cutoff_mid_high, cutoff_mid_high], [0.0, 1.0], "k--")
        cdax.axvspan(vmin, cutoff_low_mid, facecolor='g', alpha=0.3)
        cdax.axvspan(cutoff_low_mid, cutoff_mid_high, facecolor='y', alpha=0.3)
        cdax.axvspan(cutoff_mid_high, vmax, facecolor='r', alpha=0.3)
        cdax.legend(('HIGH-REC', 'NON-REC'), 'upper center', ncol=2, frameon=False, prop={'size':10})

    if args.out_path is not None:
        from matplotlib import pylab
        pylab.savefig(args.out_path, bbox_inches=0)

    if args.interactive:
        plt.show()