예제 #1
0
def main():
	"""Label the scores of the selected predictors using per-predictor cutoffs.

	Command line entry point ("calc-label"). For every row returned by
	``db.query_scores`` and every selected predictor with a non-null score,
	a label is stored under the corresponding updated predictor name:

	* ``0.0`` when the score is below ``cutoff_low_mid``
	* ``1.0`` when the score is below ``cutoff_mid_high``
	* ``2.0`` otherwise

	The cutoffs are read from the ``stats`` entry of the state loaded from
	the ``CUTOFFS`` file. Updated predictors that do not exist in the
	database yet are created before the labels are written.

	``-u/--updated-predictors`` is interpreted as a comma separated list with
	one name per selected predictor; when omitted, ``<predictor>_LABEL`` is
	used.

	Returns:
		0 on success, otherwise whatever ``cmd.handle_error()`` returns
		(presumably an error exit code -- confirm against cmdhelper).
	"""
	parser = argparse.ArgumentParser(
		description="Calculate TransFIC labels")

	cmd = Command.withtraits(DbTrait, PredictorsInDbTrait)(parser)

	cmd.add_db_args()

	parser.add_argument("cutoffs_path", metavar="CUTOFFS",
						help="File containing the cutoffs")

	cmd.add_selected_predictors_args()

	parser.add_argument("-u", "--updated-predictors", dest="updated_predictors", metavar="NAMES",
						help="Updated predictor names")

	args, logger = cmd.parse_args("calc-label")

	db = cmd.open_db()

	try:
		logger.info("Loading state ...")

		state = load_weights(args.cutoffs_path)

		# built once so the membership test below does not rebuild the set per predictor
		avail_predictors = set(state["predictors"])
		stats = state["stats"]

		predictors = cmd.get_selected_predictors(default_all=True)
		missing_predictors = [p for p in predictors if p not in avail_predictors]
		if missing_predictors:
			raise Exception("Missing cutoff stats for predictors: {}".format(", ".join(missing_predictors)))

		if args.updated_predictors is not None:
			# argparse delivers the option as a single string; split it into names,
			# otherwise len() and zip() would operate on individual characters
			updated_names = [name.strip() for name in args.updated_predictors.split(",")]
			if len(predictors) != len(updated_names):
				raise Exception("The number of selected predictors does not match the number of predictor names to update")
			updated_predictors = dict(zip(predictors, updated_names))
		else:
			updated_predictors = {p: "{}_LABEL".format(p) for p in predictors}

		# create predictors in the database if required

		db_predictors = set(p["id"] for p in db.predictors())
		for predictor, updated_predictor in updated_predictors.items():
			if updated_predictor not in db_predictors:
				logger.info("Creating predictor {} ...".format(updated_predictor))
				db.add_predictor(updated_predictor, FannsDb.CALCULATED_PREDICTOR_TYPE, source=[predictor])

		cutoffs = {}
		for predictor in predictors:
			predictor_stats = stats[predictor]
			cutoff_low_mid = predictor_stats["cutoff_low_mid"]
			cutoff_mid_high = predictor_stats["cutoff_mid_high"]
			logger.info("{}: cutoffs: low_mid={}, mid_high={}".format(predictor, cutoff_low_mid, cutoff_mid_high))
			cutoffs[predictor] = (cutoff_low_mid, cutoff_mid_high)

		logger.info("Calculating ...")

		progress = RatedProgress(logger, name="SNVs")

		for row in db.query_scores(predictors=predictors):
			scores = row["scores"]
			uscores = {}
			for predictor in predictors:
				score = scores[predictor]
				if score is None:
					continue

				cutoff_low_mid, cutoff_mid_high = cutoffs[predictor]

				if score < cutoff_low_mid:
					label = 0.0
				elif score < cutoff_mid_high:
					label = 1.0
				else:
					label = 2.0

				uscores[updated_predictors[predictor]] = label

			if uscores:
				db.update_scores(row["id"], uscores)

			progress.update()

		db.commit()
	except:
		# top-level boundary: delegate reporting to the command helper and
		# propagate its result instead of pretending the run succeeded
		return cmd.handle_error()
	finally:
		db.close()

	return 0
예제 #2
0
def main():
	"""Import SNV coordinates and predictor scores from a tab separated file.

	Command line entry point ("import"). The first line of the source is
	taken as the header. The predictors to import are the user selection
	restricted to the non-coordinate columns present in the header;
	predictors missing from the database are created. Every following line
	is type-cast and stored with ``db.add_snv``.

	With ``--ignore-errors`` a malformed line (too few columns or a value
	that fails its type cast) is logged and skipped; without it the first
	such line aborts the import.

	After reading, predictors are updated and indices created unless the
	corresponding ``--skip-*`` flag is given. The database is always closed.

	Returns:
		0 on success, otherwise whatever ``cmd.handle_error()`` returns.
	"""
	parser = argparse.ArgumentParser(
		description="Import scores into the database")

	cmd = Command.withtraits(DbTrait, PredictorsTrait)(parser)

	cmd.add_db_args()

	parser.add_argument("source_path", metavar="SOURCE",
						help="The source file. Use - for standard input.")

	#TODO: which are the coordinates column

	cmd.add_selected_predictors_args()

	parser.add_argument("--skip-empty-scores", dest="skip_empty_scores", action="store_true", default=False,
						help="Skip SNV's where all the scores are empty")

	parser.add_argument("--skip-update-predictors", dest="skip_update_predictors", action="store_true", default=False,
						help="Skip the update of the predictors.")

	parser.add_argument("--skip-create-index", dest="skip_create_index", action="store_true", default=False,
						help="Skip the creation of the database indices.")

	parser.add_argument("--ignore-errors", dest="ignore_errors", action="store_true", default=False,
						help="When errors on the input file, report them but continue processing the input.")

	args, logger = cmd.parse_args("import")

	db = cmd.open_db()

	try:
		progress = RatedProgress(logger, name="SNVs")

		logger.info("Reading {} ...".format(args.source_path if args.source_path != "-" else "from standard input"))

		with tsv.open(args.source_path) as f:
			# Parse header: column name -> 0-based column index
			hdr_line = f.readline()
			hdr = {}
			for index, name in enumerate(hdr_line.rstrip("\n").split("\t")):
				hdr[name] = index

			# Predictors to update from the user selection and source availability
			db_predictors = set(p["id"] for p in db.predictors())
			src_predictors = [name for name in hdr if name not in COORD_COLUMNS]
			predictors = cmd.get_selected_predictors(available_predictors=src_predictors)
			for predictor in predictors:
				if predictor not in db_predictors:
					logger.info("Creating non existing predictor: {}".format(predictor))
					db.add_predictor(predictor, FannsDb.SOURCE_PREDICTOR_TYPE)

			logger.info("Predictors: {}".format(", ".join(predictors)))

			all_columns = COORD_COLUMNS + predictors
			types = COORD_TYPES + ([score_value] * len(predictors))

			missing_columns = [name for name in all_columns if name not in hdr]
			if missing_columns:
				raise Exception("The following columns are missing: {}".format(", ".join(missing_columns)))

			columns = [hdr[name] for name in all_columns]
			max_column = max(columns)

			# data starts at the second line of the file, hence start=2
			for line_num, line in enumerate(f, start=2):
				fields = line.rstrip("\n").split("\t")

				# max_column is a 0-based index, so the line needs at least
				# max_column + 1 fields for every required column to exist
				if len(fields) <= max_column:
					message = "Missing columns for line {}: {}".format(line_num, " ".join(fields))
					logger.error(message)
					if not args.ignore_errors:
						raise Exception(message)
					continue

				try:
					fields = [type_cast(fields[index]) for type_cast, index in zip(types, columns)]
				except Exception as ex:
					logger.error("{} at line {}: {}".format(str(ex), line_num, " ".join(fields)))
					if not args.ignore_errors:
						raise
					# the line could not be converted; do not import raw strings
					continue

				# after the cast, fields follow the order of all_columns:
				# the coordinate columns first, then one score per predictor
				(chrom, strand, start, ref, alt, transcript,
				 aa_pos, aa_ref, aa_alt, protein) = fields[:10]

				scores = fields[10:]

				if args.skip_empty_scores and all(s is None for s in scores):
					continue

				try:
					db.add_snv(
								chr=chrom, strand=strand, start=start, ref=ref, alt=alt, transcript=transcript,
								protein=protein, aa_pos=aa_pos, aa_ref=aa_ref, aa_alt=aa_alt,
								scores=dict(zip(predictors, scores)))
				except Exception as ex:
					logger.error("Error importing SNV at line {}: {}".format(line_num, str(ex)))
					if not args.ignore_errors:
						raise

				progress.update()

		progress.log_totals()

		logger.info("Finalizing database ...")

		if not args.skip_update_predictors:
			logger.info("Updating predictors ...")
			db.update_predictors()

		logger.info("Committing ...")
		db.commit()

		if not args.skip_create_index:
			logger.info("Creating indices ...")
			db.create_indices()

		logger.info("Finished successfully. Elapsed time: {}".format(progress.elapsed_time))

	except:
		return cmd.handle_error()
	finally:
		db.close()

	return 0
예제 #3
0
def main():
    """Calculate TransFIC scores for the selected predictors and store them.

    Command line entry point ("calc"). Each row returned by
    ``db.query_scores`` is mapped to a feature (its transcript, its protein
    or one of the database annotations), the baseline tolerance statistics
    of that feature are looked up in the ``BLT_PATH`` document and
    ``calculate_tfic`` produces the scores written back with
    ``db.update_scores``. Rows whose feature has no statistics are skipped.

    ``-u/--updated-predictors`` is interpreted as a comma separated list with
    one name per selected predictor; when omitted, ``TFIC_<predictor>`` is
    used. Updated predictors missing from the database are created.

    Returns:
        0 on success, otherwise whatever ``cmd.handle_error()`` returns.

    Raises:
        SystemExit: with code -1 when the feature name is not available or
            the number of updated predictor names does not match.
    """
    # local import: the file's import section is not guaranteed to provide sys
    import sys

    parser = argparse.ArgumentParser(description="Calculate TransFIC for the selected scores")

    cmd = Command.withtraits(DbTrait, PredictorsInDbTrait, TransformsTrait)(parser)

    cmd.add_db_args()

    parser.add_argument(
        "feature_name",
        metavar="FEATURE_COLUMN",
        help="The column name with the features. It can be transcript, protein or any of the available annotations.",
    )

    parser.add_argument("blt_path", metavar="BLT_PATH", help="The baseline tolerance statistics.")

    cmd.add_selected_predictors_args()

    parser.add_argument(
        "-u", "--updated-predictors", dest="updated_predictors", metavar="NAME", help="Updated predictor names"
    )

    cmd.add_transform_args()

    args, logger = cmd.parse_args("calc")

    db = cmd.open_db()

    # initialize feature selection

    feature_name = args.feature_name
    builtin_features = ["transcript", "protein"]

    db_annotations = [a["id"] for a in db.maps()]
    if feature_name not in set(builtin_features + db_annotations):
        logger.error("Feature name not available in the database: {}".format(feature_name))
        logger.error("Available annotations: {}".format(", ".join(db_annotations)))
        sys.exit(-1)

    # exact match, in line with the validation above and the row lookup below
    if feature_name in builtin_features:
        annotations = None
        feature_getter = lambda row: row[feature_name]
    else:
        annotations = [feature_name]
        feature_getter = lambda row: row["annotations"][feature_name]

    # predictors, transforms, and updated_predictors

    predictors = cmd.get_selected_predictors(default_all=True)

    transforms = cmd.get_transforms()

    if args.updated_predictors is not None:
        # argparse delivers the option as a single string; split it into names,
        # otherwise len() and zip() would operate on individual characters
        updated_names = [name.strip() for name in args.updated_predictors.split(",")]
        if len(predictors) != len(updated_names):
            logger.error("The number of selected predictors does not match the number of predictor names to update")
            sys.exit(-1)
        updated_predictors = dict(zip(predictors, updated_names))
    else:
        updated_predictors = {p: "TFIC_{}".format(p) for p in predictors}

    # create predictors in the database if required

    db_predictors = set(p["id"] for p in db.predictors())
    for predictor, updated_predictor in updated_predictors.items():
        if updated_predictor not in db_predictors:
            logger.info("Creating predictor {} ...".format(updated_predictor))
            db.add_predictor(updated_predictor, FannsDb.CALCULATED_PREDICTOR_TYPE, source=[predictor])

    try:
        logger.info("Loading baseline tolerance statistics ...")

        with tsv.open(args.blt_path) as f:
            doc = json.load(f)

        blt_predictors = doc["predictors"]
        features = doc["features"]
        blt_stats = doc["blt"]

        logger.info("  Predictors: {}".format(", ".join(blt_predictors)))
        logger.info("  Features: {}".format(len(features)))

        logger.info("Calculating ...")

        progress = RatedProgress(logger, name="SNVs")

        rows_count = updated_count = 0
        for row in db.query_scores(predictors=predictors, maps=annotations):
            rows_count += 1

            feature = feature_getter(row)
            if feature not in blt_stats:
                # no baseline tolerance statistics for this feature
                continue

            tfic_scores = calculate_tfic(predictors, updated_predictors, blt_stats[feature], row["scores"], transforms)

            if tfic_scores:
                db.update_scores(row["id"], tfic_scores)
                updated_count += 1

            progress.update()

        progress.log_totals()

        logger.info("Commit ...")

        db.commit()

        logger.info(
            "Finished. Total rows = {}, updated rows = {}, elapsed_time = {}".format(
                rows_count, updated_count, progress.elapsed_time
            )
        )

    except:
        # top-level boundary: delegate reporting to the command helper and
        # propagate its result instead of pretending the run succeeded
        return cmd.handle_error()
    finally:
        db.close()

    return 0
예제 #4
0
import os

from bgcore import logging as bglogging

from fannsdb.cmdhelper import Command, PredictorsTrait, TransformsTrait

# Command helper obtained by combining Command with the predictor and
# transform traits through Command.withtraits.
# NOTE(review): presumably a class that is instantiated with an argparse
# parser -- confirm against fannsdb.cmdhelper.
DefaultCommandHelper = Command.withtraits(PredictorsTrait, TransformsTrait)
예제 #5
0
def main():
	"""Plot a grid of MCC comparisons between pairs of predictors.

	Command line entry point ("perf-plot"). The figure has one row per
	weights file and one column per ``-c S1/S2`` comparison. Each cell shows
	two horizontal bars with ``metrics[<predictor>]["best_perf"]["MCC"]``
	scaled to percent: S1 at the bottom and S2 on top (tick labels "Orig"
	and "TFIC" on the first column).

	The row title is derived from the file name: a ``.json`` extension and a
	``-weights`` suffix are removed and the remainder is split on ``__``.

	The image is saved when ``-o`` is given and shown when ``-i`` is given.
	Without ``-i`` the non-interactive ``Agg`` backend is selected.

	Returns:
		0 on success, otherwise whatever ``cmd.handle_error()`` returns.
	"""
	parser = argparse.ArgumentParser(
		description="Calculate TransFIC for the selected scores")

	cmd = Command(parser)

	parser.add_argument("weights", metavar="WEIGHTS_PATH", nargs="+",
						help="The list of files containing weights.")

	parser.add_argument("-c", dest="comparisons", metavar="S1/S2", action="append",
						help="Compare score S1 with S2")

	parser.add_argument("-o", dest="out_path", metavar="PATH",
						help="The path to save the plot image.")

	parser.add_argument("-W", "--width", dest="fig_width", metavar="WIDTH", type=int,
						help="The image width.")

	parser.add_argument("-H", "--height", dest="fig_height", metavar="HEIGHT", type=int,
						help="The image height.")

	parser.add_argument("--dpi", dest="fig_dpi", metavar="DPI", type=int, default=100,
						help="The image dpi.")

	parser.add_argument("-i", "--interactive", dest="interactive", action="store_true", default=False,
						help="Show the plot in interactive mode.")

	parser.add_argument("-t", "--title", dest="title", metavar="TITLE",
						help="The plot title.")

	args, logger = cmd.parse_args("perf-plot")

	try:
		# height of every bar; two bars (0.0-0.5 and 0.5-1.0) fill one cell
		bar_height = 0.5

		if args.comparisons is None:
			raise Exception("Select the predictors to compare with -c please.")

		comparisons = []
		for c in args.comparisons:
			pair = c.split("/")
			if len(pair) != 2:
				raise Exception("Wrong comparison format: {}".format(c))
			comparisons.append(tuple(pair))

		num_comparisons = len(comparisons)

		if not args.interactive:
			import matplotlib
			matplotlib.use('Agg')

		# pyplot is imported here, after the backend has been chosen, so that
		# the plt name used below is always bound within this function
		from matplotlib import pylab
		from matplotlib import pyplot as plt
		#pylab.rcParams['xtick.major.pad'] = '8'
		pylab.rcParams['ytick.major.pad'] = '20'

		fig = plt.figure(figsize=(args.fig_width or 12, args.fig_height or 10.4), dpi=args.fig_dpi or 100)
		fig.subplots_adjust(left=0.11, bottom=0.03, right=0.99, top=0.94, wspace=0.07, hspace=0.53)
		if args.title is not None:
			fig.suptitle(args.title)

		# x ticks are the same for every cell: 0, 10, ..., 90
		x_ticks = np.arange(0, 100, 10)

		num_rows = len(args.weights)
		for row_idx, wpath in enumerate(args.weights):
			basename = os.path.basename(wpath)
			name = basename
			if name.endswith(".json"):
				name = name[:-5]
			if name.endswith("-weights"):
				name = name[:-8]

			if name.find("__") > 0:
				name1, name2 = name.split("__")[:2]
			else:
				name1 = name2 = name

			logger.info("{} ...".format(name))

			with open(wpath) as f:
				state = json.load(f)

			metrics = state["metrics"]

			row_title = "{}\n{}".format(name1, name2)

			for col_idx, (p1, p2) in enumerate(comparisons):
				for predictor in (p1, p2):
					if predictor not in metrics:
						raise Exception("Predictor '{}' not found in statistics file {}".format(predictor, basename))

				mcc1 = metrics[p1]["best_perf"]["MCC"] * 100.0
				mcc2 = metrics[p2]["best_perf"]["MCC"] * 100.0

				# column titles only on the first row of the grid
				col_title = p1 if row_idx == 0 else ""

				ax = fig.add_subplot(num_rows, num_comparisons, row_idx * num_comparisons + col_idx + 1, title=col_title)
				ax.set_xticks(x_ticks)
				ax.set_xlim(0, 100)
				ax.xaxis.grid(True)
				ax.set_yticks([0.25, 0.75])
				if col_idx == 0:
					ax.set_ylabel(row_title, rotation=90)
					ax.set_yticklabels(["Orig", "TFIC"])
				else:
					ax.set_yticklabels(["", ""])

				# align="edge" anchors the bars at y=0.0 and y=0.5 so that
				# the ticks at 0.25 and 0.75 sit at the middle of each bar
				ax.barh([0.0], [mcc1], bar_height, color="r", align="edge")
				ax.barh([0.5], [mcc2], bar_height, color="y", align="edge")

		if args.out_path is not None:
			plt.savefig(args.out_path, bbox_inches=0)

		if args.interactive:
			plt.show()
	except:
		# top-level boundary: delegate reporting to the command helper and
		# propagate its result instead of pretending the run succeeded
		return cmd.handle_error()

	return 0