예제 #1
0
def main():
	"""Plot training-set statistics for each selected predictor.

	Parses the command line, loads the state through ``load_weights`` and
	draws one column of four subplots per predictor: distribution,
	cumulative distribution, TP/FN/FP/TN and MCC/accuracy. Every subplot
	gets a dashed vertical line at the predictor's ``cutoff``.

	The figure is written to the ``-o`` path when given and shown on screen
	when ``-i`` is given.

	Raises:
		SystemExit: with code -1 when ``-p`` names no predictor present in
			the loaded state.
	"""
	parser = argparse.ArgumentParser(
		description="Plot training sets statistics")

	parser.add_argument("path", metavar="PATH",
		help="The statistics json file")

	parser.add_argument("-o", dest="out_path", metavar="PATH",
		help="The path to save the plot image.")

	parser.add_argument("-W", "--width", dest="fig_width", metavar="WIDTH", type=int,
		help="The image width.")

	parser.add_argument("-H", "--height", dest="fig_height", metavar="HEIGHT", type=int,
		help="The image height.")

	parser.add_argument("--dpi", dest="fig_dpi", metavar="DPI", type=int, default=100,
		help="The image dpi.")

	parser.add_argument("-p", "--predictors", dest="predictor_names", metavar="NAMES",
		help="The names of the predictors to represent (seppareted by commas).")

	parser.add_argument("-i", "--interactive", dest="interactive", action="store_true", default=False,
		help="Show the plot in interactive mode.")

	bglogging.add_logging_arguments(parser)

	args = parser.parse_args()

	bglogging.initialize(args)

	log = bglogging.get_logger("plot-stats")

	log.info("Loading state from {} ...".format(os.path.basename(args.path)))

	state = load_weights(args.path)

	predictor_names, stats = [state[k] for k in ["predictor_names", "stats"]]

	if args.predictor_names is not None:
		# Keep the order given on the command line, dropping unknown names.
		valid_names = set(predictor_names)
		requested_names = [s.strip() for s in args.predictor_names.split(",")]
		predictor_names = [name for name in requested_names if name in valid_names]

		if len(predictor_names) == 0:
			# sorted() gives a stable message; iterating the set directly does not.
			log.error("No scores selected. Please choose between: {}".format(", ".join(sorted(valid_names))))
			# SystemExit is raised directly because the exit() builtin is only
			# installed by the site module.
			raise SystemExit(-1)

	fig = plt.figure(figsize=(args.fig_width or 12, args.fig_height or 10.4), dpi=args.fig_dpi or 100)

	alpha = 0.7

	def draw_cutoff(axes, cutoff):
		# Dashed vertical marker spanning y in [0, 1] at the cutoff value.
		axes.plot([cutoff, cutoff], [0.0, 1.0], "k--")

	def draw_legend(axes, labels, size):
		# loc has to be a keyword: with two positional arguments matplotlib
		# interprets the call as legend(handles, labels).
		axes.legend(labels, loc='upper center', ncol=len(labels), frameon=False, prop={'size': size})

	num_predictors = len(predictor_names)
	for i, predictor_name in enumerate(predictor_names):
		predictor_stats = stats[predictor_name]
		(intervals, dp, dn, cump, cumn, cdp, cdn, tp, tn, fp, fn, mcc, accuracy, cutoff) = [
			predictor_stats[k] for k in [
				"values", "dp", "dn", "cump", "cumn", "cdp", "cdn", "tp", "tn", "fp", "fn", "mcc", "acc", "cutoff"]]

		# Subplot indices are 1-based and run row by row, so row r of this
		# predictor's column is at r * num_predictors + i + 1.
		dax = fig.add_subplot(4, num_predictors, i + 1, title="{}".format(predictor_name))
		cdax = fig.add_subplot(4, num_predictors, 1 * num_predictors + i + 1)
		tfax = fig.add_subplot(4, num_predictors, 2 * num_predictors + i + 1)
		aax = fig.add_subplot(4, num_predictors, 3 * num_predictors + i + 1)

		# distribution
		dax.grid()
		dax.plot(intervals, arrdiv(dp, max(dp)), "r-", alpha=alpha)
		dax.plot(intervals, arrdiv(dn, max(dn)), "b-", alpha=alpha)
		draw_cutoff(dax, cutoff)
		draw_legend(dax, ('POS', 'NEG'), 10)

		# cumulative distribution
		cdax.grid()
		cdax.plot(intervals, arrdiv(cdp, cump), "r-", alpha=alpha)
		cdax.plot(intervals, arrdiv(cdn, cumn), "b-", alpha=alpha)
		draw_cutoff(cdax, cutoff)
		draw_legend(cdax, ('POS', 'NEG'), 10)

		# TP/FN/FP/TN
		tfax.grid()
		tfax.plot(intervals, arrdiv(tp, cump), "r-", alpha=alpha)
		tfax.plot(intervals, arrdiv(fn, cump), "c--", alpha=alpha)
		tfax.plot(intervals, arrdiv(fp, cumn), "b-", alpha=alpha)
		tfax.plot(intervals, arrdiv(tn, cumn), "m--", alpha=alpha)
		draw_cutoff(tfax, cutoff)
		draw_legend(tfax, ('TP', 'FN', 'FP', 'TN'), 8)

		# MCC/Accuracy
		aax.grid()
		aax.plot(intervals, mcc, "g-", alpha=alpha)
		aax.plot(intervals, accuracy, "y-", alpha=alpha)
		draw_cutoff(aax, cutoff)
		draw_legend(aax, ('MCC', 'Accuracy'), 10)

	if args.out_path is not None:
		log.info("Saving image into {} ...".format(os.path.basename(args.out_path)))
		# Save the figure built above explicitly rather than pylab's "current"
		# figure. NOTE(review): bbox_inches=0 is carried over from the original
		# call; confirm the intended value.
		fig.savefig(args.out_path, bbox_inches=0)

	if args.interactive:
		plt.show()
예제 #2
0
def main():
	"""Store a three-level label for every score of the selected predictors.

	Loads per-predictor ``cutoff_low_mid`` / ``cutoff_mid_high`` values from
	the CUTOFFS file and, for every row returned by ``db.query_scores``,
	writes 0.0 (score below the low/mid cutoff), 1.0 (below the mid/high
	cutoff) or 2.0 (otherwise) under the matching updated predictor name.
	Rows whose selected scores are all ``None`` are left untouched.

	Updated predictor names come from ``-u`` as a comma separated list, one
	per selected predictor and in the same order; without ``-u`` they
	default to ``<predictor>_LABEL``. Missing updated predictors are created
	in the database first.

	Any exception raised while processing is passed to ``cmd.handle_error()``.

	Returns:
		0, always.
	"""
	parser = argparse.ArgumentParser(
		description="Calculate TransFIC labels")

	cmd = Command.withtraits(DbTrait, PredictorsInDbTrait)(parser)

	cmd.add_db_args()

	parser.add_argument("cutoffs_path", metavar="CUTOFFS",
		help="File containing the cutoffs")

	cmd.add_selected_predictors_args()

	parser.add_argument("-u", "--updated-predictors", dest="updated_predictors", metavar="NAMES",
		help="Updated predictor names")

	args, logger = cmd.parse_args("calc-label")

	db = cmd.open_db()

	try:
		logger.info("Loading state ...")

		state = load_weights(args.cutoffs_path)

		avail_predictors, stats = [state[k] for k in ["predictors", "stats"]]

		predictors = cmd.get_selected_predictors(default_all=True)
		# Build the lookup set once instead of once per tested predictor.
		avail_set = set(avail_predictors)
		missing_predictors = [p for p in predictors if p not in avail_set]
		if len(missing_predictors) > 0:
			raise Exception("Missing cutoff stats for predictors: {}".format(", ".join(missing_predictors)))

		if args.updated_predictors is not None:
			# The option arrives as one string; split it into names before
			# counting and pairing. Using the raw string would compare against
			# its character count and pair predictors with single characters.
			updated_names = [name.strip() for name in args.updated_predictors.split(",")]
			if len(predictors) != len(updated_names):
				raise Exception("The number of selected predictors does not match the number of predictor names to update")
			updated_predictors = dict(zip(predictors, updated_names))
		else:
			updated_predictors = dict((p, "{}_LABEL".format(p)) for p in predictors)

		# create predictors in the database if required

		db_predictors = set(p["id"] for p in db.predictors())
		for predictor, updated_predictor in updated_predictors.items():
			if updated_predictor not in db_predictors:
				logger.info("Creating predictor {} ...".format(updated_predictor))
				db.add_predictor(updated_predictor, FannsDb.CALCULATED_PREDICTOR_TYPE, source=[predictor])

		cutoffs = {}
		for predictor in predictors:
			cutoff_low_mid, cutoff_mid_high = [stats[predictor][v] for v in ["cutoff_low_mid", "cutoff_mid_high"]]
			logger.info("{}: cutoffs: low_mid={}, mid_high={}".format(predictor, cutoff_low_mid, cutoff_mid_high))
			cutoffs[predictor] = (cutoff_low_mid, cutoff_mid_high)

		logger.info("Calculating ...")

		progress = RatedProgress(logger, name="SNVs")

		for row in db.query_scores(predictors=predictors):
			scores = row["scores"]
			uscores = {}
			for predictor in predictors:
				score = scores[predictor]
				if score is None:
					continue

				cutoff_low_mid, cutoff_mid_high = cutoffs[predictor]
				updated_predictor = updated_predictors[predictor]

				# 0.0 / 1.0 / 2.0 for the low / mid / high band respectively.
				uscores[updated_predictor] = 0.0 if score < cutoff_low_mid else 1.0 if score < cutoff_mid_high else 2.0

			# Nothing to write when every selected score was None.
			if len(uscores) > 0:
				db.update_scores(row["id"], uscores)

			progress.update()

		db.commit()
	except:
		# NOTE(review): the bare except is kept so that every failure still
		# reaches cmd.handle_error(), whose behaviour is not visible here.
		cmd.handle_error()

	return 0
예제 #3
0
def main():
	"""Store a binary class for every score of the selected predictors.

	Opens the functional scores database, loads per-predictor cutoffs from
	the WEIGHTS file and, for every row returned by ``db.query_scores``,
	writes 0.0 when the score is below the predictor's ``cutoff`` and 1.0
	otherwise, under the matching updated predictor name. ``None`` scores
	are skipped.

	Updated predictor names come from ``-u`` (comma separated, one per
	selected predictor, same order) and default to ``<PREDICTOR>_CLASS``.

	The changes are committed once all rows are processed and rolled back
	on Ctrl-C or on any error. The database is closed on every exit path.

	Raises:
		SystemExit: with code -1 when ``-p`` names no available predictor or
			when the number of ``-u`` names does not match the selection.
	"""
	parser = argparse.ArgumentParser(
		description="Calculate Condel label")

	parser.add_argument("db_path", metavar="DB_PATH",
		help="Functional scores database")

	parser.add_argument("weights_path", metavar="WEIGHTS",
		help="File containing the scores weights and cutoffs")

	parser.add_argument("-p", "--predictors", dest="predictors", metavar="PREDICTORS",
		help="Comma separated list of predictors")

	parser.add_argument("-u", "--updated-predictors", dest="updated_predictors", metavar="NAMES",
		help="Updated predictor names")

	bglogging.add_logging_arguments(parser)

	args = parser.parse_args()

	bglogging.initialize(args)

	log = bglogging.get_logger("calculate-label")

	log.info("Opening functional scores database ...")

	db = FannsSQLiteDb(args.db_path)
	db.open()

	# Everything after open() runs under try/finally so that the early exits
	# and any setup failure below also close the database.
	try:
		log.info("Loading state ...")

		state = load_weights(args.weights_path)

		avail_predictors = state["predictor_names"]
		stats = state["stats"]

		if args.predictors is not None:
			requested = [p.strip() for p in args.predictors.split(",")]
			predictors = [p for p in requested if p in avail_predictors]
			if len(predictors) == 0:
				log.error("Unknown predictors: {}".format(args.predictors))
				log.error("Available predictor names are: {}".format(", ".join(avail_predictors)))
				raise SystemExit(-1)
		else:
			predictors = avail_predictors

		if args.updated_predictors is not None:
			updated_predictors = [p.strip() for p in args.updated_predictors.split(",")]
			if len(predictors) != len(updated_predictors):
				log.error("Number of updated predictors does not match with the list of number of predictors")
				raise SystemExit(-1)
		else:
			updated_predictors = ["{}_CLASS".format(p.upper()) for p in predictors]

		log.info("Available predictors: {}".format(", ".join(avail_predictors)))
		log.info("Selected predictors: {}".format(", ".join(predictors)))

		# Only create the updated predictors that the database does not list
		# yet, so the command can be run again on the same database.
		db_predictors = set(p["id"] for p in db.predictors())
		for predictor, updated_predictor in zip(predictors, updated_predictors):
			if updated_predictor not in db_predictors:
				log.info("Creating predictor {} ...".format(updated_predictor))
				db.add_predictor(updated_predictor, FannsDb.CALCULATED_PREDICTOR_TYPE, source=[predictor])

		cutoffs = []
		for predictor in predictors:
			cutoff, mcc, acc = [stats[predictor][v] for v in ["cutoff", "cutoff_mcc", "cutoff_acc"]]
			log.info("{}: cutoff={}, MCC={}, accuracy={}".format(predictor, cutoff, mcc, acc))
			cutoffs.append(cutoff)

		log.info("Calculating ...")

		start_time = partial_start_time = time.time()
		try:
			for num_rows, row in enumerate(db.query_scores(predictors=predictors), start=1):
				scores = row["scores"]
				d = {}
				for predictor, updated_predictor, cutoff in zip(predictors, updated_predictors, cutoffs):
					score = scores[predictor]
					if score is None:
						continue

					d[updated_predictor] = 0.0 if score < cutoff else 1.0

				# Skip the write when every selected score was None.
				if d:
					db.update_scores(row["id"], d)

				# Report throughput roughly every five seconds.
				partial_time = time.time() - partial_start_time
				if partial_time > 5.0:
					partial_start_time = time.time()
					elapsed_time = time.time() - start_time
					log.debug("  {} rows, {:.1f} rows/second".format(hsize(num_rows), num_rows / elapsed_time))

			db.commit()
		except KeyboardInterrupt:
			# NOTE(review): assumes log is a standard logging.Logger, where
			# warn() is a deprecated alias of warning() — confirm.
			log.warning("Interrupted by Ctrl-C")
			db.rollback()
		except BaseException:
			# Undo partial updates, then let the error propagate.
			db.rollback()
			raise
	finally:
		db.close()
예제 #4
0
def main():
	"""Plot one ROC-style curve per selected predictor.

	Loads the state through ``load_weights`` and, for each predictor, plots
	the true positive rate ``tp / (tp + fn)`` against the false positive
	rate ``fp / (fp + tn)`` for each of the ``size`` entries in its stats,
	preceded by the point (1.0, 1.0). A dashed grey diagonal from (0, 0) to
	(1, 1) is added for reference.

	The figure is written to the ``-o`` path when given and shown on screen
	when ``-i`` is given.

	Raises:
		SystemExit: with code -1 when ``-p`` names no predictor present in
			the loaded state.
	"""
	parser = argparse.ArgumentParser(
		description="Plot training sets statistics")

	parser.add_argument("path", metavar="PATH",
		help="The statistics json file")

	parser.add_argument("-o", dest="out_path", metavar="PATH",
		help="The path to save the plot image.")

	parser.add_argument("-p", "--predictors", dest="predictor_names", metavar="NAMES",
		help="The names of the predictors to represent (seppareted by commas).")

	parser.add_argument("-i", "--interactive", dest="interactive", action="store_true", default=False,
		help="Show the plot in interactive mode.")

	bglogging.add_logging_arguments(parser)

	args = parser.parse_args()

	bglogging.initialize(args)

	log = bglogging.get_logger("plot-stats")

	log.info("Loading state ...")

	state = load_weights(args.path)

	predictor_names, stats = [state[k] for k in ["predictor_names", "stats"]]

	if args.predictor_names is not None:
		# Keep the order given on the command line, dropping unknown names.
		valid_names = set(predictor_names)
		requested_names = [s.strip() for s in args.predictor_names.split(",")]
		predictor_names = [name for name in requested_names if name in valid_names]

		if len(predictor_names) == 0:
			# sorted() gives a stable message; iterating the set directly does not.
			log.error("No scores selected. Please choose between: {}".format(", ".join(sorted(valid_names))))
			# SystemExit is raised directly because the exit() builtin is only
			# installed by the site module.
			raise SystemExit(-1)

	log.info("Plotting ...")

	fig = plt.figure()
	ax = fig.add_subplot(111)
	ax.grid()
	ax.set_xlabel("False Positive Rate (1 - especificity)")
	ax.set_ylabel("True Positive Rate (sensitivity)")

	def rate(hits, total):
		# The rate is undefined for an empty class; NaN avoids a
		# ZeroDivisionError and leaves a gap in the curve.
		return float(hits) / total if total else float("nan")

	for predictor_name in predictor_names:
		predictor_stats = stats[predictor_name]
		(size, tp, tn, fp, fn) = [predictor_stats[k] for k in ["size", "tp", "tn", "fp", "fn"]]

		# Both curves start at (1.0, 1.0), followed by one point per entry.
		tpr = [1.0] + [rate(tp[i], tp[i] + fn[i]) for i in range(size)]
		fpr = [1.0] + [rate(fp[i], fp[i] + tn[i]) for i in range(size)]

		ax.plot(fpr, tpr, "-")

	# The legend is created before the diagonal is drawn, so the labels map to
	# the predictor curves only. loc has to be a keyword: with two positional
	# arguments matplotlib interprets the call as legend(handles, labels).
	ax.legend(tuple(predictor_names), loc="lower right", shadow=False)

	ax.plot([0.0, 1.0], [0.0, 1.0], "--", color="0.75")

	if args.out_path is not None:
		# Save the figure built above explicitly rather than pylab's "current"
		# figure. NOTE(review): bbox_inches=0 is carried over from the original
		# call; confirm the intended value.
		fig.savefig(args.out_path, bbox_inches=0)

	if args.interactive:
		plt.show()
예제 #5
0
def main():
	"""Compute a weighted-average score per row and store it in the database.

	Opens the functional scores database, creates the updated predictor
	(``-u``, default ``CONDEL``) when the database does not list it yet and
	loads the per-predictor statistics from the WEIGHTS file.

	For every row returned by ``db.query_scores`` each non-``None`` score is
	optionally transformed through ``PREDICTOR_TRANSFORM`` and mapped to an
	index into the predictor's ``cdp``/``cdn`` tables. It is weighted with
	``1 - cdn[index]`` when below the predictor's ``cutoff`` and with
	``1 - cdp[index]`` otherwise. The stored value is
	``sum(w * score) / sum(w)``; rows whose weights sum to zero are logged
	and skipped.

	The changes are committed once all rows are processed and rolled back
	on Ctrl-C or on any error. The database is closed on every exit path.

	Raises:
		SystemExit: with code -1 when ``-p`` names no available predictor.
	"""
	parser = argparse.ArgumentParser(
		description="Calculate Condel score")

	parser.add_argument("db_path", metavar="DB_PATH",
		help="Functional scores database")

	parser.add_argument("weights_path", metavar="WEIGHTS",
		help="File containing the scores weights and cutoffs")

	parser.add_argument("-p", "--predictors", dest="predictors", metavar="PREDICTORS",
		help="Comma separated list of predictors")

	parser.add_argument("-u", "--updated-predictor", dest="updated_predictor", metavar="NAME",
		help="Updated predictor name")

	bglogging.add_logging_arguments(parser)

	args = parser.parse_args()

	bglogging.initialize(args)

	log = bglogging.get_logger("calculate")

	log.info("Opening functional scores database ...")

	db = FannsSQLiteDb(args.db_path)
	db.open()

	# Everything after open() runs under try/finally so that the early exit
	# and any setup failure below also close the database.
	try:
		updated_predictor = args.updated_predictor or "CONDEL"

		db_predictors = set(p["id"] for p in db.predictors())
		if updated_predictor not in db_predictors:
			log.info("  Creating predictor {} ...".format(updated_predictor))
			# NOTE(review): the source is every predictor currently in the
			# database, not only the ones selected below — confirm intent.
			db.add_predictor(updated_predictor, FannsDb.CALCULATED_PREDICTOR_TYPE, source=db_predictors)

		log.info("Loading state ...")

		state = load_weights(args.weights_path)

		avail_predictors = state["predictor_names"]
		stats = state["stats"]

		if args.predictors is not None:
			requested = [p.strip() for p in args.predictors.split(",")]
			predictors = [p for p in requested if p in avail_predictors]
			if len(predictors) == 0:
				log.error("Unknown predictors: {}".format(args.predictors))
				log.error("Available predictor names are: {}".format(", ".join(avail_predictors)))
				raise SystemExit(-1)
		else:
			predictors = avail_predictors

		log.info("Available predictors: {}".format(", ".join(avail_predictors)))
		log.info("Selected predictors: {}".format(", ".join(predictors)))

		log.info("Calculating ...")

		start_time = partial_start_time = time.time()
		try:
			for num_rows, row in enumerate(db.query_scores(predictors=predictors), start=1):
				scores = row["scores"]
				condel = wsum = 0
				for predictor, score in scores.items():
					if score is None:
						continue

					predictor_stats = stats[predictor]
					rmin, rmax, dim, size, cdp, cdn, cutoff = [predictor_stats[k] for k in [
						"rmin", "rmax", "dim", "size", "cdp", "cdn", "cutoff"]]

					if predictor in PREDICTOR_TRANSFORM:
						score = PREDICTOR_TRANSFORM[predictor](score)

					r = (score - rmin) / dim
					# Scores at or above rmax use the last slot. Scores below
					# rmin are clamped to the first one: a negative index
					# would silently read from the end of the table.
					index = max(int(r * size), 0) if score < rmax else size - 1

					if score < cutoff:
						w = 1 - cdn[index]
					else:
						w = 1 - cdp[index]

					wsum += w
					condel += w * score

				if wsum != 0:
					condel /= wsum

					db.update_scores(row["id"], {updated_predictor : condel})
				else:
					# NOTE(review): assumes log is a standard logging.Logger,
					# where warn() is a deprecated alias of warning() — confirm.
					log.warning("wsum = 0, condel={}, scores={}".format(condel, repr(scores)))

				# Report throughput roughly every five seconds.
				partial_time = time.time() - partial_start_time
				if partial_time > 5.0:
					partial_start_time = time.time()
					elapsed_time = time.time() - start_time
					log.debug("  {} rows, {:.1f} rows/second".format(hsize(num_rows), num_rows / elapsed_time))

			log.info("Commit ...")
			db.commit()
		except KeyboardInterrupt:
			log.warning("Interrupted by Ctrl-C")
			db.rollback()
		except BaseException:
			# Undo partial updates, then let the error propagate.
			db.rollback()
			raise
	finally:
		db.close()
예제 #6
0
def main():
	"""Plot score distributions with the two cutoffs of each predictor.

	Loads the state through ``load_weights`` and draws two subplots per
	selected predictor: the ``dp``/``dn`` distributions on the top row and
	the ``cdp``/``cdn`` cumulative distributions, divided by ``cump`` and
	``cumn``, on the bottom row. Both subplots mark ``cutoff_low_mid`` and
	``cutoff_mid_high`` with dashed lines and shade the three resulting
	ranges between ``vmin`` and ``vmax`` in green, yellow and red.

	The figure is written to the ``-o`` path when given and shown on screen
	when ``-i`` is given.
	"""
	parser = argparse.ArgumentParser(
		description="Plot cutoffs")

	cmd = DefaultCommandHelper(parser)

	parser.add_argument("path", metavar="PATH",
		help="The statistics json file")

	parser.add_argument("-o", dest="out_path", metavar="PATH",
		help="The path to save the plot image.")

	cmd.add_selected_predictors_args()

	parser.add_argument("-i", "--interactive", dest="interactive", action="store_true", default=False,
		help="Show the plot in interactive mode.")

	args, logger = cmd.parse_args("plot-cutoffs")

	logger.info("Loading state ...")

	state = load_weights(args.path)

	available_predictors, stats = [state[k] for k in ["predictors", "stats"]]

	predictors = cmd.get_selected_predictors(available_predictors)

	logger.info("Plotting ...")

	fig = plt.figure(figsize=(12.4, 10.5), dpi=100)
	fig.subplots_adjust(left=0.06, bottom=0.03, right=0.99, top=0.96, wspace=0.22, hspace=0.15)

	def decorate(axes, bounds, line_top):
		# Dashed lines at both cutoffs, the three shaded ranges and the legend.
		vmin, low_mid, mid_high, vmax = bounds
		axes.plot([low_mid, low_mid], [0.0, line_top], "k--")
		axes.plot([mid_high, mid_high], [0.0, line_top], "k--")
		axes.axvspan(vmin, low_mid, facecolor='g', alpha=0.3)
		axes.axvspan(low_mid, mid_high, facecolor='y', alpha=0.3)
		axes.axvspan(mid_high, vmax, facecolor='r', alpha=0.3)
		# loc has to be a keyword: with two positional arguments matplotlib
		# interprets the call as legend(handles, labels).
		axes.legend(('HIGH-REC', 'NON-REC'), loc='upper center', ncol=2, frameon=False, prop={'size':10})

	num_predictors = len(predictors)
	for i, predictor in enumerate(predictors):

		predictor_stats = stats[predictor]
		(intervals, vmin, vmax, dp, dn, cump, cumn, cdp, cdn, cutoff_low_mid, cutoff_mid_high) = [predictor_stats[k] for k in [
			"values", "vmin", "vmax", "dp", "dn", "cump", "cumn", "cdp", "cdn", "cutoff_low_mid", "cutoff_mid_high"]]

		bounds = (vmin, cutoff_low_mid, cutoff_mid_high, vmax)

		# Subplot indices are 1-based and run row by row: the top row holds the
		# distributions, the bottom row the cumulative distributions.
		dax = fig.add_subplot(2, num_predictors, i + 1, title="{}".format(predictor))
		cdax = fig.add_subplot(2, num_predictors, 1 * num_predictors + i + 1)

		dax.grid()
		dax.set_xlim(vmin, vmax)
		dax.plot(intervals, dp, "r-", alpha=0.5)
		dax.plot(intervals, dn, "b-", alpha=0.5)
		# Height of the cutoff lines, computed once for both of them.
		decorate(dax, bounds, max(dp + dn))

		cdax.grid()
		cdax.set_xlim(vmin, vmax)
		cdax.plot(intervals, [v / float(cump) for v in cdp], "r-")
		cdax.plot(intervals, [v / float(cumn) for v in cdn], "b-")
		decorate(cdax, bounds, 1.0)

	if args.out_path is not None:
		# Save the figure built above explicitly rather than pylab's "current"
		# figure. NOTE(review): bbox_inches=0 is carried over from the original
		# call; confirm the intended value.
		fig.savefig(args.out_path, bbox_inches=0)

	if args.interactive:
		plt.show()