def main(): parser = argparse.ArgumentParser(description="Extract mutations in VCF and save as simple tabulated file") parser.add_argument("vcf_paths", metavar="PATH", nargs="+", help="The VCF files") parser.add_argument("-o", dest="out_path", metavar="PATH", help="Output file. Use - for standard output.") bglogging.add_logging_arguments(self._parser) args = parser.parse_args() bglogging.initialize(self.args) log = bglogging.get_logger("vcf-to-snvs") if args.out_path is None: names = [] for path in args.vcf_paths: if path != "-": base_path, name, ext = tsv.split_path(path) names += [name] prefix = os.path.commonprefix(*names) if len(names) > 0 else "" prefix = prefix.rstrip(".") if len(prefix) == 0: prefix = "genome" args.out_path = "{}.tsv.gz".format(prefix) with tsv.open(args.out_path, "w") as outf: tsv.write_line(outf, "CHR", "POS", "REF", "ALT") for path in args.vcf_paths: log.info("Reading {} ...".format(path)) with tsv.open(path) as inf: types = (str, str, str, str) columns = [0, 1, 3, 4] for fields in tsv.lines(inf, types, columns=columns): chrom, pos, ref, alt = fields # ref = ref.upper().strip("N") # alt = alt.upper().strip("N") ref_len = len(ref) alt_len = len(alt) if ref_len != alt_len or ref_len == 0 or alt_len == 0: continue try: pos = int(pos) except: continue if ref_len == 1: tsv.write_line(outf, chrom, pos, ref, alt) else: for i in range(ref_len): tsv.write_line(outf, chrom, pos + i, ref[i], alt[i])
def parse_args(self, logger_name):
	bglogging.add_logging_arguments(self._parser)

	self.args = self._parser.parse_args()

	bglogging.initialize(self.args)

	self.logger = bglogging.get_logger(logger_name)

	return self.args, self.logger
def main(): parser = argparse.ArgumentParser( description="Plot training sets statistics") parser.add_argument("path", metavar="PATH", help="The statistics json file") parser.add_argument("-o", dest="out_path", metavar="PATH", help="The path to save the plot image.") parser.add_argument("-W", "--width", dest="fig_width", metavar="WIDTH", type=int, help="The image width.") parser.add_argument("-H", "--height", dest="fig_height", metavar="HEIGHT", type=int, help="The image height.") parser.add_argument("--dpi", dest="fig_dpi", metavar="DPI", type=int, default=100, help="The image dpi.") parser.add_argument("-p", "--predictors", dest="predictor_names", metavar="NAMES", help="The names of the predictors to represent (seppareted by commas).") parser.add_argument("-i", "--interactive", dest="interactive", action="store_true", default=False, help="Show the plot in interactive mode.") bglogging.add_logging_arguments(parser) args = parser.parse_args() bglogging.initialize(args) log = bglogging.get_logger("plot-stats") log.info("Loading state from {} ...".format(os.path.basename(args.path))) state = load_weights(args.path) predictor_names, stats = [state[k] for k in ["predictor_names", "stats"]] if args.predictor_names is not None: valid_names = set(predictor_names) args.predictor_names = [s.strip() for s in args.predictor_names.split(",")] predictor_names = [name for name in args.predictor_names if name in valid_names] if len(predictor_names) == 0: log.error("No scores selected. Please choose between: {}".format(", ".join(valid_names))) exit(-1) #log.info("Plotting ...") fig = plt.figure(figsize=(args.fig_width or 12, args.fig_height or 10.4), dpi=args.fig_dpi or 100) alpha = 0.7 num_predictors = len(predictor_names) for i in range(num_predictors): predictor_name = predictor_names[i] predictor_stats = stats[predictor_name] (intervals, dp, dn, cump, cumn, cdp, cdn, tp, tn, fp, fn, mcc, accuracy, cutoff) = [ predictor_stats[k] for k in [ "values", "dp", "dn", "cump", "cumn", "cdp", "cdn", "tp", "tn", "fp", "fn", "mcc", "acc", "cutoff"]] dax = fig.add_subplot(4, num_predictors, i + 1, title="{}".format(predictor_name)) cdax = fig.add_subplot(4, num_predictors, 1 * num_predictors + i + 1) tfax = fig.add_subplot(4, num_predictors, 2 * num_predictors + i + 1) aax = fig.add_subplot(4, num_predictors, 3 * num_predictors + i + 1) # distribution dax.grid() dax.plot(intervals, arrdiv(dp, max(dp)), "r-", alpha=alpha) dax.plot(intervals, arrdiv(dn, max(dn)), "b-", alpha=alpha) dax.plot([cutoff, cutoff], [0.0, 1.0], "k--") dax.legend(('POS', 'NEG'), 'upper center', ncol=2, frameon=False, prop={'size':10}) # cummulative distribution cdax.grid() cdax.plot(intervals, arrdiv(cdp, cump), "r-", alpha=alpha) cdax.plot(intervals, arrdiv(cdn, cumn), "b-", alpha=alpha) cdax.plot([cutoff, cutoff], [0.0, 1.0], "k--") cdax.legend(('POS', 'NEG'), 'upper center', ncol=2, frameon=False, prop={'size':10}) # TP/FN/FP/TN tfax.grid() tfax.plot(intervals, arrdiv(tp, cump), "r-", alpha=alpha) tfax.plot(intervals, arrdiv(fn, cump), "c--", alpha=alpha) tfax.plot(intervals, arrdiv(fp, cumn), "b-", alpha=alpha) tfax.plot(intervals, arrdiv(tn, cumn), "m--", alpha=alpha) tfax.plot([cutoff, cutoff], [0.0, 1.0], "k--") tfax.legend(('TP', 'FN', 'FP', 'TN'), 'upper center', ncol=4, frameon=False, prop={'size':8}) # MCC/Accuracy aax.grid() aax.plot(intervals, mcc, "g-", alpha=alpha) aax.plot(intervals, accuracy, "y-", alpha=alpha) aax.plot([cutoff, cutoff], [0.0, 1.0], "k--") aax.legend(('MCC', 'Accuracy'), 'upper center', ncol=2, 
frameon=False, prop={'size':10}) if args.out_path is not None: from matplotlib import pylab log.info("Saving image into {} ...".format(os.path.basename(args.out_path))) pylab.savefig(args.out_path, bbox_inches=0) if args.interactive: plt.show()
def main(): parser = argparse.ArgumentParser( description="Calculate weights") parser.add_argument("ranges_path", metavar="RANGES_PATH", help="JSON file generated with pred-list containing predictors stats. Only min and max are used.") parser.add_argument("training_path", metavar="TRAINING_PATH", help="The training set scores. ID column should be POS/NEG for positive/negative sets.") parser.add_argument("-o", dest="out_path", metavar="OUT_PATH", help="The file where weights will be saved. Use - for standard output.") parser.add_argument("-p", "--predictors", dest="predictors", metavar="PREDICTORS", help="Comma separated list of predictors to fetch") parser.add_argument("-P", "--precision", dest="precision", metavar="PRECISSION", type=int, default=3, help="Distribution precision") parser.add_argument("-f", "--full-state", dest="full_state", action="store_true", default=False, help="Save intermediate calculations to allow further exploration and plotting") bglogging.add_logging_arguments(parser) args = parser.parse_args() bglogging.initialize(args) logger = bglogging.get_logger("weights") if args.out_path is None: prefix = os.path.splitext(os.path.basename(args.training_path))[0] if prefix.endswith("-scores"): prefix = prefix[:-7] args.out_path = os.path.join(os.getcwd(), "{}-weights.json".format(prefix)) if args.predictors is not None: args.predictors = [p.strip() for p in args.predictors.split(",")] logger.info("Loading ranges from {} ...".format(os.path.basename(args.ranges_path))) with open(args.ranges_path) as f: pred_stats = json.load(f) predictor_range = {} for pid, pstats in pred_stats.items(): predictor_range[pid] = (pstats["min"], pstats["max"]) logger.info("Reading training set {} ...".format(args.training_path if args.training_path != "-" else "from standard input")) with tsv.open(args.training_path) as f: # Select predictors from the available predictors in the dataset or user selection column_names, column_indices = tsv.header(f) available_predictors = [c for c in column_names if c not in set(COORD_COLUMNS)] if args.predictors is None: predictors = available_predictors else: missing_predictors = [p for p in args.predictors if p not in set(available_predictors)] if len(missing_predictors) > 0: logger.error("Missing predictors: {}".format(", ".join(missing_predictors))) exit(-1) predictors = args.predictors data = pd.read_csv(args.training_path, sep="\t", index_col=False, usecols=["ID"] + predictors, true_values=["POS"], false_values=["NEG"]) data.rename(columns={"ID" : "EVT"}, inplace=True) # Initialize statistics logger.info("Initializing metrics ...") step = 1.0 / 10**args.precision stats = dict() state = dict( predictor_names = predictors, precision = args.precision, step = step, stats = stats) for predictor in predictors: d = data[["EVT", predictor]] d = d[np.isfinite(d.iloc[:, 1])] nump = d.iloc[:, 0].sum() numn = d.shape[0] - nump rmin, rmax = d.iloc[:, 1].min(), d.iloc[:, 1].max() dim = rmax - rmin size = int(dim / step) + 1 values = [(x * step) + rmin for x in xrange(size)] logger.info(" {:10}: p={}, n={}, min={}, max={}, bins={}".format(predictor, nump, numn, rmin, rmax, size)) stats[predictor] = dict( rmin = rmin, rmax = rmax, dim = dim, values = values, size = size, vmin = rmin, vmax = rmax, dp = [0] * size, dn = [0] * size, cdp = [0] * size, cdn = [0] * size, cump = 0, cumn = 0, tp = [0] * size, tn = [0] * size, fp = [0] * size, fn = [0] * size, mcc = [0] * size, acc = [0] * size, auc = [0] * size, cutoff = None, cutoff_index = None, cutoff_mcc = None, cutoff_acc = 
None, cutoff_auc = None) positive_count = data.iloc[:, 0].sum() negative_count = data.shape[0] - positive_count logger.info(" TOTAL : positive={}, negative={}".format(positive_count, negative_count)) logger.info("Calculating scores distribution and confusion matrices ...") logger.info("Calculating cumulative distribution ...") for predictor in predictors: predictor_stats = stats[predictor] dp, dn, cdp, cdn = [predictor_stats[k] for k in ["dp", "dn", "cdp", "cdn"]] cump = 0 cumn = 0 i = len(dp) - 1 while i >= 0: cdp[i] = dp[i] + cump cump += dp[i] cdn[i] = dn[i] + cumn cumn += dn[i] i -= 1 predictor_stats["cump"] = cump predictor_stats["cumn"] = cumn logger.info(" {}: cump={}, cumn={}".format(predictor, cump, cumn)) logger.info("Calculating accuracy and cutoff ...") for predictor in predictors: predictor_stats = stats[predictor] values, size, tp, tn, fp, fn, mcc, acc = [predictor_stats[k] for k in [ "values", "size", "tp", "tn", "fp", "fn", "mcc", "acc"]] cutoff = -1 cutoff_index = -1 best_mcc = -1e6 for i in xrange(size): try: #http://en.wikipedia.org/wiki/Matthews_correlation_coefficient mcc[i] = (tp[i] * tn[i] - fp[i] * fn[i]) / sqrt((tp[i] + fp[i]) * (tp[i] + fn[i]) * (tn[i] + fp[i]) * (tn[i] + fn[i])) #http://en.wikipedia.org/wiki/Accuracy acc[i] = (tp[i] + tn[i]) / float(tp[i] + fp[i] + fn[i] + tn[i]) except ZeroDivisionError: mcc[i] = 0 acc[i] = 0 if mcc[i] > best_mcc: cutoff = values[i] cutoff_index = i best_mcc = mcc[i] best_acc = max(acc) predictor_stats["cutoff"] = cutoff predictor_stats["cutoff_index"] = cutoff_index predictor_stats["cutoff_mcc"] = best_mcc predictor_stats["cutoff_acc"] = best_acc logger.info(" {}: cutoff={:.3f}, mcc={:.2f}, accuracy={:.2f}".format( predictor, cutoff, best_mcc * 100.0, best_acc * 100.0)) if args.full_state: logger.info("Saving weights with full state ...") out_path = args.out_path save_weights(out_path, state) else: logger.info("Saving weights ...") stats = {} reduced_state = dict( predictor_names=state["predictor_names"], precision=state["precision"], step=state["step"], stats=stats) for predictor in state["predictor_names"]: predictor_stats = state["stats"][predictor] stats[predictor] = dict( rmin=predictor_stats["rmin"], rmax=predictor_stats["rmax"], dim=predictor_stats["dim"], values=predictor_stats["values"], size=predictor_stats["size"], cdp=predictor_stats["cdp"], cdn=predictor_stats["cdn"], cutoff=predictor_stats["cutoff"], cutoff_index=predictor_stats["cutoff_index"]) save_weights(args.out_path, reduced_state) return 0
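# Hedged sketch (not part of the script): the per-cutoff metrics computed above, written
# as a standalone function for a single confusion matrix. It uses the same MCC and
# accuracy formulas and the same ZeroDivisionError fallback as the loop above.
from math import sqrt

def confusion_metrics(tp, tn, fp, fn):
	try:
		mcc = (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
		acc = (tp + tn) / float(tp + fp + fn + tn)
	except ZeroDivisionError:
		mcc, acc = 0, 0
	return mcc, acc

# Example: confusion_metrics(tp=80, tn=70, fp=30, fn=20) -> (mcc ~= 0.503, acc = 0.75)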
def main(): parser = argparse.ArgumentParser( description="Calculate Condel label") parser.add_argument("db_path", metavar="DB_PATH", help="Functional scores database") parser.add_argument("weights_path", metavar="WEIGHTS", help="File containing the scores weights and cutoffs") parser.add_argument("-p", "--predictors", dest="predictors", metavar="PREDICTORS", help="Comma separated list of predictors") parser.add_argument("-u", "--updated-predictors", dest="updated_predictors", metavar="NAMES", help="Updated predictor names") bglogging.add_logging_arguments(parser) args = parser.parse_args() bglogging.initialize(args) log = bglogging.get_logger("calculate-label") log.info("Opening functional scores database ...") db = FannsSQLiteDb(args.db_path) db.open() log.info("Loading state ...") state = load_weights(args.weights_path) avail_predictors, precision, step, stats = [state[k] for k in ["predictor_names", "precision", "step", "stats"]] if args.predictors is not None: predictors = [p for p in [p.strip() for p in args.predictors.split(",")] if p in avail_predictors] if len(predictors) == 0: log.error("Unknown predictors: {}".format(args.predictors)) log.error("Available predictor names are: {}".format(", ".join(avail_predictors))) exit(-1) else: predictors = avail_predictors if args.updated_predictors is not None: updated_predictors = [p.strip() for p in args.updated_predictors.split(",")] if len(predictors) != len(updated_predictors): log.error("Number of updated predictors does not match with the list of number of predictors") exit(-1) else: updated_predictors = ["{}_CLASS".format(p.upper()) for p in predictors] log.info("Available predictors: {}".format(", ".join(avail_predictors))) log.info("Selected predictors: {}".format(", ".join(predictors))) for predictor, updated_predictor in zip(predictors, updated_predictors): log.info("Creating predictor {} ...".format(updated_predictor)) db.add_predictor(updated_predictor, FannsDb.CALCULATED_PREDICTOR_TYPE, source=[predictor]) cutoffs = [] for predictor in predictors: cutoff, mcc, acc = [stats[predictor][v] for v in ["cutoff", "cutoff_mcc", "cutoff_acc"]] log.info("{}: cutoff={}, MCC={}, accuracy={}".format(predictor, cutoff, mcc, acc)) cutoffs += [cutoff] log.info("Calculating ...") start_time = partial_start_time = time.time() try: for num_rows, row in enumerate(db.query_scores(predictors=predictors), start=1): scores = row["scores"] d = {} for i, predictor in enumerate(predictors): score = scores[predictor] if score is None: continue cutoff = cutoffs[i] updated_predictor = updated_predictors[i] d[updated_predictor] = 0.0 if score < cutoff else 1.0 db.update_scores(row["id"], d) partial_time = time.time() - partial_start_time if partial_time > 5.0: partial_start_time = time.time() elapsed_time = time.time() - start_time log.debug(" {} rows, {:.1f} rows/second".format(hsize(num_rows), num_rows / elapsed_time)) db.commit() except KeyboardInterrupt: log.warn("Interrupted by Ctrl-C") db.rollback() except: db.rollback() raise finally: db.close()
def main(): parser = argparse.ArgumentParser( description="Plot training sets statistics") parser.add_argument("path", metavar="PATH", help="The statistics json file") parser.add_argument("-o", dest="out_path", metavar="PATH", help="The path to save the plot image.") parser.add_argument("-p", "--predictors", dest="predictor_names", metavar="NAMES", help="The names of the predictors to represent (seppareted by commas).") parser.add_argument("-i", "--interactive", dest="interactive", action="store_true", default=False, help="Show the plot in interactive mode.") bglogging.add_logging_arguments(parser) args = parser.parse_args() bglogging.initialize(args) log = bglogging.get_logger("plot-stats") log.info("Loading state ...") state = load_weights(args.path) predictor_names, stats = [state[k] for k in ["predictor_names", "stats"]] if args.predictor_names is not None: valid_names = set(predictor_names) args.predictor_names = [s.strip() for s in args.predictor_names.split(",")] predictor_names = [name for name in args.predictor_names if name in valid_names] if len(predictor_names) == 0: log.error("No scores selected. Please choose between: {}".format(", ".join(valid_names))) exit(-1) log.info("Plotting ...") fig = plt.figure() ax = fig.add_subplot(111) ax.grid() ax.set_xlabel("False Positive Rate (1 - especificity)") ax.set_ylabel("True Positive Rate (sensitivity)") num_predictors = len(predictor_names) for predictor_name in predictor_names: predictor_stats = stats[predictor_name] (size, tp, tn, fp, fn) = [predictor_stats[k] for k in ["size", "tp", "tn", "fp", "fn"]] tpr = [1.0] * (size + 1) fpr = [1.0] * (size + 1) for i in range(size): tpr[i + 1] = (float(tp[i]) / (tp[i] + fn[i])) fpr[i + 1] = (float(fp[i]) / (fp[i] + tn[i])) ax.plot(fpr, tpr, "-") ax.legend(tuple(predictor_names), "lower right", shadow=False) ax.plot([0.0, 1.0], [0.0, 1.0], "--", color="0.75") if args.out_path is not None: from matplotlib import pylab pylab.savefig(args.out_path, bbox_inches=0) if args.interactive: plt.show()
def main(): parser = argparse.ArgumentParser( description="Calculate Condel score") parser.add_argument("db_path", metavar="DB_PATH", help="Functional scores database") parser.add_argument("weights_path", metavar="WEIGHTS", help="File containing the scores weights and cutoffs") parser.add_argument("-p", "--predictors", dest="predictors", metavar="PREDICTORS", help="Comma separated list of predictors") parser.add_argument("-u", "--updated-predictor", dest="updated_predictor", metavar="NAME", help="Updated predictor name") bglogging.add_logging_arguments(parser) args = parser.parse_args() bglogging.initialize(args) log = bglogging.get_logger("calculate") log.info("Opening functional scores database ...") db = FannsSQLiteDb(args.db_path) db.open() updated_predictor = args.updated_predictor or "CONDEL" predictors = set([p["id"] for p in db.predictors()]) if updated_predictor not in predictors: log.info(" Creating predictor {} ...".format(updated_predictor)) db.add_predictor(updated_predictor, FannsDb.CALCULATED_PREDICTOR_TYPE, source=predictors) log.info("Loading state ...") state = load_weights(args.weights_path) avail_predictors, precision, step, stats = [state[k] for k in ["predictor_names", "precision", "step", "stats"]] if args.predictors is not None: predictors = [p for p in [p.strip() for p in args.predictors.split(",")] if p in avail_predictors] if len(predictors) == 0: log.error("Unknown predictors: {}".format(args.predictors)) log.error("Available predictor names are: {}".format(", ".join(avail_predictors))) exit(-1) else: predictors = avail_predictors log.info("Available predictors: {}".format(", ".join(avail_predictors))) log.info("Selected predictors: {}".format(", ".join(predictors))) log.info("Calculating ...") start_time = partial_start_time = time.time() try: for num_rows, row in enumerate(db.query_scores(predictors=predictors), start=1): scores = row["scores"] condel = wsum = 0 for predictor, score in scores.items(): if score is None: continue predictor_stats = stats[predictor] rmin, rmax, dim, size, cdp, cdn, cutoff = [predictor_stats[k] for k in [ "rmin", "rmax", "dim", "size", "cdp", "cdn", "cutoff"]] if predictor in PREDICTOR_TRANSFORM: score = PREDICTOR_TRANSFORM[predictor](score) r = (score - rmin) / dim index = int(r * size) if score < rmax else size - 1 if score < cutoff: w = 1 - cdn[index] else: w = 1 - cdp[index] wsum += w condel += w * score #log.info("{}={}, w={} -> {}".format(predictor_name, score, w, score * w)) if wsum != 0: condel /= wsum d = {updated_predictor : condel} db.update_scores(row["id"], d) #log.info(">>> CONDEL={}".format(condel)) else: log.warn("wsum = 0, condel={}, scores={}".format(condel, repr(scores))) partial_time = time.time() - partial_start_time if partial_time > 5.0: partial_start_time = time.time() elapsed_time = time.time() - start_time log.debug(" {} rows, {:.1f} rows/second".format(hsize(num_rows), num_rows / elapsed_time)) log.info("Commit ...") db.commit() except KeyboardInterrupt: log.warn("Interrupted by Ctrl-C") db.rollback() except: db.rollback() raise finally: db.close()
def main(): parser = argparse.ArgumentParser( description="Prepare SNV's dataset from individual training sets") parser.add_argument("pos_path", metavar="POS_SET", help="The positive training set file") parser.add_argument("neg_path", metavar="NEG_SET", help="The negative training set file") parser.add_argument("-m", "--map", dest="map_path", metavar="MAP", help="Optional mapping file for feature id's. Format: DST SRC") parser.add_argument("-o", dest="out_path", metavar="PATH", help="Output file. Use - for standard output.") bglogging.add_logging_arguments(parser) args = parser.parse_args() bglogging.initialize(args) logger = bglogging.get_logger("training-sets") if args.out_path is None: prefix = os.path.commonprefix([ os.path.splitext(os.path.basename(args.pos_path))[0], os.path.splitext(os.path.basename(args.neg_path))[0]]) prefix = prefix.rstrip(".") args.out_path = os.path.join(os.getcwd(), "{}-training.tsv".format(prefix)) if args.map_path is not None: logger.info("Loading map ...") prot_map = {} with tsv.open(args.map_path) as f: for dst_feature, src_feature in tsv.lines(f, (str, str)): if len(src_feature) > 0: if src_feature not in prot_map: prot_map[src_feature] = set([dst_feature]) else: prot_map[src_feature].add(dst_feature) else: prot_map = None logger.info("Processing ...") hits = dict(POS=0, NEG=0) fails = dict(POS=0, NEG=0) start_time = datetime.now() with tsv.open(args.out_path, "w") as wf: for event_type, path in (("POS", args.pos_path), ("NEG", args.neg_path)): logger.info(" [{}] Reading {} ...".format(event_type, path)) with tsv.open(path) as f: types = (str, int, str, str) for protein, pos, aa1, aa2 in tsv.lines(f, types): protein = protein.strip() if prot_map is not None: if protein not in prot_map: logger.debug("[{}] Unmapped protein: {}".format(event_type, protein)) fails[event_type] += 1 continue proteins = prot_map[protein] else: proteins = [protein] hits[event_type] += 1 for p in proteins: tsv.write_line(wf, p, pos, aa1.strip(), aa2.strip(), event_type) logger.info(" POS NEG") logger.info("SNVs {POS:>8} {NEG:>8}".format(**hits)) if args.map_path is not None: logger.info("unmapped {POS:>8} {NEG:>8}".format(**fails)) logger.info("Finished. Elapsed time: {}".format(datetime.now() - start_time))