import argparse

from fannsdb.cmdhelper import Command, DbTrait, PredictorsInDbTrait  # assumed module path
# FannsDb, RatedProgress and load_weights are assumed importable from the
# fannsdb package as well.


def main():
    parser = argparse.ArgumentParser(
        description="Calculate TransFIC labels")

    cmd = Command.withtraits(DbTrait, PredictorsInDbTrait)(parser)

    cmd.add_db_args()

    parser.add_argument("cutoffs_path", metavar="CUTOFFS",
                        help="File containing the cutoffs")

    cmd.add_selected_predictors_args()

    parser.add_argument("-u", "--updated-predictors", dest="updated_predictors", metavar="NAMES",
                        help="Comma-separated list of updated predictor names")

    args, logger = cmd.parse_args("calc-label")

    db = cmd.open_db()

    try:
        logger.info("Loading state ...")

        state = load_weights(args.cutoffs_path)

        avail_predictors, stats = [state[k] for k in ["predictors", "stats"]]

        predictors = cmd.get_selected_predictors(default_all=True)

        missing_predictors = [p for p in predictors if p not in set(avail_predictors)]
        if len(missing_predictors) > 0:
            raise Exception("Missing cutoff stats for predictors: {}".format(", ".join(missing_predictors)))

        if args.updated_predictors is not None:
            # argparse stores NAMES as a single string; split it into a list
            updated_names = args.updated_predictors.split(",")
            if len(predictors) != len(updated_names):
                raise Exception("The number of selected predictors does not match the number of predictor names to update")
            updated_predictors = dict(zip(predictors, updated_names))
        else:
            updated_predictors = dict([(p, "{}_LABEL".format(p)) for p in predictors])

        # create predictors in the database if required
        db_predictors = set([p["id"] for p in db.predictors()])
        for predictor, updated_predictor in updated_predictors.items():
            if updated_predictor not in db_predictors:
                logger.info("Creating predictor {} ...".format(updated_predictor))
                db.add_predictor(updated_predictor, FannsDb.CALCULATED_PREDICTOR_TYPE, source=[predictor])

        cutoffs = {}
        for predictor in predictors:
            cutoff_low_mid, cutoff_mid_high = [stats[predictor][v] for v in ["cutoff_low_mid", "cutoff_mid_high"]]
            logger.info("{}: cutoffs: low_mid={}, mid_high={}".format(predictor, cutoff_low_mid, cutoff_mid_high))
            cutoffs[predictor] = (cutoff_low_mid, cutoff_mid_high)

        logger.info("Calculating ...")

        progress = RatedProgress(logger, name="SNVs")

        for num_rows, row in enumerate(db.query_scores(predictors=predictors), start=1):
            scores = row["scores"]

            uscores = {}
            for predictor in predictors:
                score = scores[predictor]
                if score is None:
                    continue

                cutoff_low_mid, cutoff_mid_high = cutoffs[predictor]

                updated_predictor = updated_predictors[predictor]

                # map the raw score to a label: 0.0 = low, 1.0 = medium, 2.0 = high
                uscores[updated_predictor] = 0.0 if score < cutoff_low_mid else 1.0 if score < cutoff_mid_high else 2.0

            if len(uscores) > 0:
                db.update_scores(row["id"], uscores)

            progress.update()

        db.commit()
    except:
        cmd.handle_error()

    return 0
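# A minimal, self-contained sketch of the two-cutoff labeling rule applied in
# the loop above. The function name `label_for_score` is illustrative only
# and not part of fannsdb.
def label_for_score(score, cutoff_low_mid, cutoff_mid_high):
    """Map a raw predictor score to a TransFIC label.

    >>> label_for_score(0.1, 0.3, 0.7)
    0.0
    >>> label_for_score(0.5, 0.3, 0.7)
    1.0
    >>> label_for_score(0.9, 0.3, 0.7)
    2.0
    """
    if score < cutoff_low_mid:
        return 0.0
    if score < cutoff_mid_high:
        return 1.0
    return 2.0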
import argparse

from fannsdb.cmdhelper import Command, DbTrait, PredictorsTrait  # assumed module path
# FannsDb, RatedProgress, tsv, COORD_COLUMNS, COORD_TYPES and score_value are
# assumed importable from the fannsdb/bgcore packages as well.


def main():
    parser = argparse.ArgumentParser(
        description="Import scores into the database")

    cmd = Command.withtraits(DbTrait, PredictorsTrait)(parser)

    cmd.add_db_args()

    parser.add_argument("source_path", metavar="SOURCE",
                        help="The source file. Use - for standard input.")

    #TODO: which are the coordinate columns?

    cmd.add_selected_predictors_args()

    parser.add_argument("--skip-empty-scores", dest="skip_empty_scores", action="store_true", default=False,
                        help="Skip SNVs where all the scores are empty")

    parser.add_argument("--skip-update-predictors", dest="skip_update_predictors", action="store_true", default=False,
                        help="Skip the update of the predictors.")

    parser.add_argument("--skip-create-index", dest="skip_create_index", action="store_true", default=False,
                        help="Skip the creation of the database indices.")

    parser.add_argument("--ignore-errors", dest="ignore_errors", action="store_true", default=False,
                        help="On errors in the input file, report them and continue processing.")

    args, logger = cmd.parse_args("import")

    db = cmd.open_db()

    try:
        progress = RatedProgress(logger, name="SNVs")

        total_lines = 0

        logger.info("Reading {} ...".format(args.source_path if args.source_path != "-" else "from standard input"))

        with tsv.open(args.source_path) as f:
            # Parse the header and map each column name to its index
            hdr_line = f.readline()
            hdr = {}
            for index, name in enumerate(hdr_line.rstrip("\n").split("\t")):
                hdr[name] = index

            # Predictors to update from the user selection and source availability
            db_predictors = set([p["id"] for p in db.predictors()])
            src_predictors = [name for name in hdr if name not in COORD_COLUMNS]
            predictors = cmd.get_selected_predictors(available_predictors=src_predictors)
            for predictor in predictors:
                if predictor not in db_predictors:
                    logger.info("Creating non-existing predictor: {}".format(predictor))
                    db.add_predictor(predictor, FannsDb.SOURCE_PREDICTOR_TYPE)

            logger.info("Predictors: {}".format(", ".join(predictors)))

            all_columns = COORD_COLUMNS + predictors
            types = COORD_TYPES + ([score_value] * len(predictors))

            missing_columns = [name for name in all_columns if name not in hdr]
            if len(missing_columns) > 0:
                raise Exception("The following columns are missing: {}".format(", ".join(missing_columns)))

            columns = [hdr[name] for name in all_columns]
            max_column = max(columns)

            # Data lines start at 2 because the header was line 1
            for line_num, line in enumerate(f, start=2):
                fields = line.rstrip("\n").split("\t")

                # max_column is a 0-based index, so at least max_column + 1
                # fields are required
                if len(fields) <= max_column:
                    logger.error("Missing columns for line {}: {}".format(line_num, " ".join(fields)))
                    if not args.ignore_errors:
                        raise Exception("Missing columns for line {}".format(line_num))
                    continue

                try:
                    fields = [type_cast(fields[index]) for type_cast, index in zip(types, columns)]
                except Exception as ex:
                    logger.error("{} at line {}: {}".format(str(ex), line_num, " ".join(fields)))
                    if not args.ignore_errors:
                        raise
                    continue

                (chr, strand, start, ref, alt, transcript,
                 aa_pos, aa_ref, aa_alt, protein) = fields[:10]

                scores = fields[10:]

                if args.skip_empty_scores and sum([0 if s is None else 1 for s in scores]) == 0:
                    continue

                try:
                    db.add_snv(
                        chr=chr, strand=strand, start=start, ref=ref, alt=alt,
                        transcript=transcript, protein=protein,
                        aa_pos=aa_pos, aa_ref=aa_ref, aa_alt=aa_alt,
                        scores=dict(zip(predictors, scores)))
                except Exception as ex:
                    logger.error("Error importing SNV at line {}: {}".format(line_num, str(ex)))
                    if not args.ignore_errors:
                        raise

                progress.update()

            total_lines += line_num

        progress.log_totals()

        logger.info("Finalizing database ...")

        if not args.skip_update_predictors:
            logger.info("Updating predictors ...")
            db.update_predictors()

        logger.info("Committing ...")
        db.commit()

        if not args.skip_create_index:
            logger.info("Creating indices ...")
            db.create_indices()

        logger.info("Finished successfully. Elapsed time: {}".format(progress.elapsed_time))
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
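# Sketch of the expected input layout. The exact coordinate column names come
# from COORD_COLUMNS (defined elsewhere in the package); the ten fields
# unpacked above are chr, strand, start, ref, alt, transcript, aa_pos,
# aa_ref, aa_alt and protein, followed by one column per predictor
# (the header names and the SIFT/PPH2 predictors below are hypothetical):
#
#   CHR  STRAND  START    REF  ALT  TRANSCRIPT       AA_POS  AA_REF  AA_ALT  PROTEIN          SIFT  PPH2
#   1    +       1234567  A    G    ENST00000000001  42      R       Q       ENSP00000000001  0.03  0.998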
import argparse
import json

from fannsdb.cmdhelper import Command, DbTrait, PredictorsInDbTrait, TransformsTrait  # assumed module path
# FannsDb, RatedProgress, tsv and calculate_tfic are assumed importable from
# the fannsdb/bgcore packages as well.


def main():
    parser = argparse.ArgumentParser(
        description="Calculate TransFIC for the selected scores")

    cmd = Command.withtraits(DbTrait, PredictorsInDbTrait, TransformsTrait)(parser)

    cmd.add_db_args()

    parser.add_argument("feature_name", metavar="FEATURE_COLUMN",
                        help="The column name with the features. It can be transcript, protein or any of the available annotations.")

    parser.add_argument("blt_path", metavar="BLT_PATH",
                        help="The baseline tolerance statistics.")

    cmd.add_selected_predictors_args()

    parser.add_argument("-u", "--updated-predictors", dest="updated_predictors", metavar="NAMES",
                        help="Comma-separated list of updated predictor names")

    cmd.add_transform_args()

    args, logger = cmd.parse_args("calc")

    db = cmd.open_db()

    # initialize feature selection
    db_annotations = [a["id"] for a in db.maps()]
    if args.feature_name not in set(["transcript", "protein"] + db_annotations):
        logger.error("Feature name not available in the database: {}".format(args.feature_name))
        logger.error("Available annotations: {}".format(", ".join(db_annotations)))
        exit(-1)

    if args.feature_name.lower() in ["transcript", "protein"]:
        annotations = None
        feature_getter = lambda row: row[args.feature_name]
    else:
        annotations = [args.feature_name]
        feature_getter = lambda row: row["annotations"][args.feature_name]

    # predictors, transforms and updated_predictors
    predictors = cmd.get_selected_predictors(default_all=True)

    transforms = cmd.get_transforms()

    if args.updated_predictors is not None:
        # argparse stores NAMES as a single string; split it into a list
        updated_names = args.updated_predictors.split(",")
        if len(predictors) != len(updated_names):
            logger.error("The number of selected predictors does not match the number of predictor names to update")
            exit(-1)
        updated_predictors = dict(zip(predictors, updated_names))
    else:
        updated_predictors = dict([(p, "TFIC_{}".format(p)) for p in predictors])

    # create predictors in the database if required
    db_predictors = set([p["id"] for p in db.predictors()])
    for predictor, updated_predictor in updated_predictors.items():
        if updated_predictor not in db_predictors:
            logger.info("Creating predictor {} ...".format(updated_predictor))
            db.add_predictor(updated_predictor, FannsDb.CALCULATED_PREDICTOR_TYPE, source=[predictor])

    try:
        logger.info("Loading baseline tolerance statistics ...")

        with tsv.open(args.blt_path) as f:
            doc = json.load(f)
            blt_predictors = doc["predictors"]
            features = doc["features"]
            blt_stats = doc["blt"]
            num_predictors = len(blt_predictors)

        logger.info("  Predictors: {}".format(", ".join(blt_predictors)))
        logger.info("  Features: {}".format(len(features)))

        logger.info("Calculating ...")

        progress = RatedProgress(logger, name="SNVs")

        rows_count = updated_count = 0
        for row in db.query_scores(predictors=predictors, maps=annotations):
            rows_count += 1

            scores = row["scores"]

            feature = feature_getter(row)
            if feature not in blt_stats:
                continue

            feature_stats = blt_stats[feature]

            tfic_scores = calculate_tfic(predictors, updated_predictors, feature_stats, scores, transforms)

            if len(tfic_scores) > 0:
                db.update_scores(row["id"], tfic_scores)
                updated_count += 1

            progress.update()

        progress.log_totals()

        logger.info("Commit ...")
        db.commit()

        logger.info("Finished. Total rows = {}, updated rows = {}, elapsed_time = {}".format(
            rows_count, updated_count, progress.elapsed_time))
    except:
        cmd.handle_error()

    return 0
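# Shape of the baseline tolerance (BLT) statistics file as read above. Only
# the three top-level keys are used here; the contents of each per-feature
# stats object are consumed by calculate_tfic and are sketched, not exact
# (the predictor and feature ids below are hypothetical):
#
# {
#     "predictors": ["SIFT", "PPH2"],
#     "features": ["ENST00000000001", "..."],
#     "blt": {
#         "ENST00000000001": {"...": "per-feature statistics"}
#     }
# }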
import os

from bgcore import logging as bglogging
from fannsdb.cmdhelper import Command, PredictorsTrait, TransformsTrait

DefaultCommandHelper = Command.withtraits(PredictorsTrait, TransformsTrait)
import argparse
import json
import os

import numpy as np

from fannsdb.cmdhelper import Command  # assumed module path


def main():
    parser = argparse.ArgumentParser(
        description="Plot performance comparisons between the selected scores")

    cmd = Command(parser)

    parser.add_argument("weights", metavar="WEIGHTS_PATH", nargs="+",
                        help="The list of files containing weights.")

    parser.add_argument("-c", dest="comparisons", metavar="S1/S2", action="append",
                        help="Compare score S1 with S2")

    parser.add_argument("-o", dest="out_path", metavar="PATH",
                        help="The path to save the plot image.")

    parser.add_argument("-W", "--width", dest="fig_width", metavar="WIDTH", type=int,
                        help="The image width.")

    parser.add_argument("-H", "--height", dest="fig_height", metavar="HEIGHT", type=int,
                        help="The image height.")

    parser.add_argument("--dpi", dest="fig_dpi", metavar="DPI", type=int, default=100,
                        help="The image dpi.")

    parser.add_argument("-i", "--interactive", dest="interactive", action="store_true", default=False,
                        help="Show the plot in interactive mode.")

    parser.add_argument("-t", "--title", dest="title", metavar="TITLE",
                        help="The plot title.")

    args, logger = cmd.parse_args("perf-plot")

    try:
        width = 0.5

        if args.comparisons is None:
            raise Exception("Select the predictors to compare with -c please.")

        comparisons = []
        for c in args.comparisons:
            s = c.split("/")
            if len(s) != 2:
                raise Exception("Wrong comparison format: {}".format(c))
            comparisons += [tuple(s)]

        num_comparisons = len(comparisons)

        if not args.interactive:
            import matplotlib
            matplotlib.use("Agg")  # select a non-interactive backend before importing pyplot

        from matplotlib import pyplot as plt

        #plt.rcParams['xtick.major.pad'] = '8'
        plt.rcParams['ytick.major.pad'] = '20'

        fig = plt.figure(figsize=(args.fig_width or 12, args.fig_height or 10.4), dpi=args.fig_dpi or 100)
        fig.subplots_adjust(left=0.11, bottom=0.03, right=0.99, top=0.94, wspace=0.07, hspace=0.53)

        if args.title is not None:
            fig.suptitle(args.title)

        num_rows = len(args.weights)
        for row_idx, wpath in enumerate(args.weights):
            # derive the row names from the file name: <name1>__<name2>-weights.json
            basename = os.path.basename(wpath)
            name = basename
            if name.endswith(".json"):
                name = name[:-5]
            if name.endswith("-weights"):
                name = name[:-8]

            i = name.find("__")
            if i > 0:
                name1, name2 = name.split("__")[:2]
            else:
                name1 = name2 = name

            logger.info("{} ...".format(name))

            with open(wpath) as f:
                state = json.load(f)

            metrics = state["metrics"]

            x = np.arange(0, 100, 10)

            j = 0
            for p1, p2 in comparisons:
                if p1 not in metrics:
                    raise Exception("Predictor '{}' not found in statistics file {}".format(p1, basename))
                if p2 not in metrics:
                    raise Exception("Predictor '{}' not found in statistics file {}".format(p2, basename))

                stats1 = metrics[p1]
                stats2 = metrics[p2]

                mcc1 = stats1["best_perf"]["MCC"] * 100.0
                mcc2 = stats2["best_perf"]["MCC"] * 100.0

                row_title = "{}\n{}".format(name1, name2)
                col_title = p1 if row_idx == 0 else ""

                ax = fig.add_subplot(num_rows, num_comparisons, row_idx * num_comparisons + j + 1, title=col_title)
                ax.set_xticks(x)
                ax.set_xlim(0, 100)
                ax.xaxis.grid(True)
                ax.set_yticks([0.25, 0.75])
                if j == 0:
                    ax.set_ylabel(row_title, rotation=90)
                    ax.set_yticklabels(["Orig", "TFIC"])
                else:
                    ax.set_yticklabels(["", ""])

                b1 = ax.barh([0.0], [mcc1], width, color="r")
                b2 = ax.barh([0.5], [mcc2], width, color="y")

                j += 1

                #ax.set_xticklabels(tuple([p1 for p1, p2 in comparisons]))
                #ax.legend((b1[0], b2[0]), ("Original score", "TransFIC score"))

        if args.out_path is not None:
            plt.savefig(args.out_path, bbox_inches=0)

        if args.interactive:
            plt.show()
    except:
        cmd.handle_error()

    return 0
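# Example invocation (the script name is hypothetical; the flags are the ones
# defined above). Each -c pair selects two predictors from the weights files
# whose best MCC values are drawn side by side, one subplot row per file:
#
#   python perf_plot.py acc1__acc2-weights.json \
#       -c SIFT/TFIC_SIFT -c PPH2/TFIC_PPH2 \
#       -t "Original vs TransFIC" -o perf.png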