Пример #1
0
def datasets(projects_set):
	"""Append a summary row for one classifier group to its datasets TSV file.

	The affected samples of every project in the group are summed and a row
	with the group names, that total and the comma separated project ids is
	appended to "<normalized classifier id>.tsv" inside the combination path.
	A header line is written first when the file does not exist yet.

	:param projects_set: a (classifier, projects) pair. Keys read from
		classifier: "id", "name", "group_name", "group_short_name" and
		"group_long_name". Keys read from each project: "id" and "db".
	"""
	log = task.logger
	conf = task.conf

	classifier, projects = projects_set

	classifier_id = classifier["id"]

	group_name = classifier["group_name"]
	group_short_name = classifier["group_short_name"]
	group_long_name = classifier["group_long_name"]

	group_file_prefix = normalize_id(classifier_id)

	log.info("--- [{0} ({1}) ({2}) ({3})] {4}".format(classifier["name"], group_long_name, group_short_name, group_name, "-" * 30))

	log.info("Reading number of samples per project ...")

	project_ids = []
	total_samples = 0
	for project in projects:
		project_id = project["id"]
		project_ids.append(project_id)

		log.info("  Project {0}".format(project_id))

		projdb = ProjectDb(project["db"])
		try:
			num_samples = projdb.get_total_affected_samples()
		finally:
			# Release the project database even when the query fails
			projdb.close()

		total_samples += num_samples

		log.debug("    {0} samples".format(num_samples))

	log.debug("  {0} samples in total".format(total_samples))

	log.info("Updating ...")

	combination_path = get_combination_path(conf)

	path = os.path.join(combination_path, "{0}.tsv".format(group_file_prefix))

	# The header is only needed by whoever creates the file; later calls just append
	write_header = not os.path.exists(path)

	with open(path, "a") as f:
		if write_header:
			tsv.write_line(f, "NAME", "SHORT_NAME", "LONG_NAME", "SAMPLES_TOTAL", "PROJECT_IDS")
		tsv.write_line(f, group_name, group_short_name, group_long_name, total_samples, ",".join(project_ids))
Пример #2
0
def _write_recurrence_params(f, classifier_id, group_name, group_short_name, group_long_name, project_ids, sample_total):
	"""Write the parameter lines shared by the three recurrence output files."""
	tsv.write_param(f, "classifier", classifier_id)
	tsv.write_param(f, "group_id", group_name)
	tsv.write_param(f, "group_short_name", group_short_name)
	tsv.write_param(f, "group_long_name", group_long_name)
	tsv.write_param(f, "projects", ",".join(project_ids))
	tsv.write_param(f, "SAMPLE_TOTAL", sample_total)


def _add_project_recurrences(log, c, projdb):
	"""Accumulate the sample frequencies of one project into the combination db.

	:param log: logger used for progress messages.
	:param c: cursor on the combination database.
	:param projdb: open project database to read from.
	:return: the total number of affected samples reported by the project.
	"""
	project_sample_total = projdb.get_total_affected_samples()

	log.info("    Total samples = {0}".format(project_sample_total))

	log.info("    Variant genes ...")

	count = 0
	for afg in projdb.affected_genes(join_variant=True, join_xrefs=True, join_rec=True):
		var = afg.var
		rec = afg.rec

		if rec.sample_freq is None:
			log.warn("Discarding variant gene without sample frequency: {0}".format(repr(afg)))
			continue

		start, end, ref, alt = var_to_tab(var)

		# Insert the variant, or look up its id when the insert is rejected
		# (presumably by a uniqueness constraint declared in create_db)
		try:
			c.execute(
				"INSERT INTO variants (chr, strand, start, ref, alt, xrefs) VALUES (?,?,?,?,?,?)",
				(var.chr, var.strand, start, ref, alt, ",".join(var.xrefs)))
			var_id = c.lastrowid
		except sqlite3.IntegrityError:
			c.execute(
				"SELECT var_id FROM variants WHERE chr=? AND strand=? AND start=? AND ref=? AND alt=?",
				(var.chr, var.strand, start, ref, alt))
			var_id = c.fetchone()[0]

		# Same pattern for the variant/gene pair: a rejected insert means the
		# pair was already added, so its frequency is summed instead
		try:
			c.execute(
				"INSERT INTO variant_genes (var_id, gene_id, impact, coding_region, prot_changes, sample_freq) VALUES (?,?,?,?,?,?)",
				(var_id, afg.gene_id, afg.impact, afg.coding_region, afg.prot_changes, rec.sample_freq))
		except sqlite3.IntegrityError:
			c.execute("""
				UPDATE variant_genes
				SET sample_freq=sample_freq + ?
				WHERE var_id=? AND gene_id=?""",
				(rec.sample_freq, var_id, afg.gene_id))

		count += 1

	log.info("      {0} variant genes".format(count))

	log.info("    Genes ...")

	count = 0
	for gene in projdb.genes(join_xrefs=True, join_rec=True):
		rec = gene.rec

		if rec.sample_freq is None:
			continue

		c.execute("SELECT COUNT(*) FROM genes WHERE gene_id=?", (gene.id,))
		if c.fetchone()[0] == 0:
			c.execute(
				"INSERT INTO genes (gene_id, sample_freq) VALUES (?,?)",
				(gene.id, rec.sample_freq))
		else:
			c.execute(
				"UPDATE genes SET sample_freq=sample_freq + ? WHERE gene_id=?",
				(rec.sample_freq, gene.id))
		count += 1

	log.info("      {0} genes".format(count))

	log.info("    Pathways ...")

	count = 0
	for pathway in projdb.pathways(join_rec=True):
		rec = pathway.rec

		if rec.sample_freq is None:
			continue

		c.execute("SELECT COUNT(*) FROM pathways WHERE pathway_id=?", (pathway.id,))
		if c.fetchone()[0] == 0:
			c.execute(
				"INSERT INTO pathways (pathway_id, sample_freq) VALUES (?,?)",
				(pathway.id, rec.sample_freq))
		else:
			c.execute(
				"UPDATE pathways SET sample_freq=sample_freq + ? WHERE pathway_id=?",
				(rec.sample_freq, pathway.id))
		count += 1

	log.info("      {0} pathways".format(count))

	return project_sample_total


def combination_recurrences(projects_set):
	"""Combine the recurrences of the projects of one classifier group.

	The sample frequencies of the variant genes, genes and pathways of every
	project are summed in a temporary SQLite database, divided by the total
	number of affected samples among the projects, and saved as
	"variant_gene-<prefix>.tsv.gz", "gene-<prefix>.tsv.gz" and
	"pathway-<prefix>.tsv.gz" in the "recurrences" combination path.

	:param projects_set: a (classifier, projects) pair. Keys read from
		classifier: "id", "name", "group_values", "group_name",
		"group_short_name" and "group_long_name". Keys read from each
		project: "id" and "db".
	"""
	log = task.logger
	conf = task.conf

	classifier, projects = projects_set

	classifier_id = classifier["id"]

	group_values = classifier["group_values"]

	group_name = classifier["group_name"]
	group_short_name = classifier["group_short_name"]
	group_long_name = classifier["group_long_name"]

	if len(group_values) == 0:
		group_file_prefix = classifier_id
	else:
		group_file_prefix = "{0}-{1}".format(classifier_id, group_short_name)

	group_file_prefix = normalize_id(group_file_prefix)

	log.info("--- [{0} ({1}) ({2}) ({3})] {4}".format(classifier["name"], group_long_name, group_short_name, group_name, "-" * 30))

	log.info("Creating database ...")

	db_path = make_temp_file(task, suffix="-{0}.db".format(group_file_prefix))
	log.debug("  > {0}".format(db_path))

	conn = sqlite3.connect(db_path)
	try:
		conn.row_factory = sqlite3.Row

		create_db(conn)

		log.info("Combining recurrences ...")

		c = conn.cursor()

		sample_total = 0

		project_ids = []
		for project in projects:
			project_ids.append(project["id"])

			log.info("  Project {0}:".format(project["id"]))

			projdb = ProjectDb(project["db"])
			try:
				sample_total += _add_project_recurrences(log, c, projdb)
			finally:
				# Release the project database even when combining fails
				projdb.close()

		log.info("Calculating proportions with {0} samples in total among projects ...".format(sample_total))

		if sample_total > 0:
			# The total is bound as a REAL parameter instead of being spliced
			# into the SQL text; the table names are fixed literals
			total = float(sample_total)
			for table in ("variant_genes", "genes", "pathways"):
				c.execute(
					"UPDATE {0} SET sample_prop=CAST(sample_freq AS REAL)/?".format(table),
					(total,))

		c.close()
		conn.commit()

		log.info("Saving results ...")

		c = conn.cursor()

		base_path = get_combination_path(conf, "recurrences")

		group_params = (classifier_id, group_name, group_short_name, group_long_name, project_ids, sample_total)

		log.info("  Variant genes ...")

		with tsv.open(os.path.join(base_path, "variant_gene-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
			_write_recurrence_params(f, *group_params)
			tsv.write_line(f, "CHR", "STRAND", "START", "ALLELE", "GENE_ID", "IMPACT", "IMPACT_CLASS", "SAMPLE_FREQ", "SAMPLE_PROP", "PROT_CHANGES", "XREFS")
			# "chr*1" sorts by the numeric value of chr first, then by its text
			for r in c.execute("SELECT * FROM variant_genes JOIN variants USING (var_id) ORDER BY chr*1, chr, strand, start, gene_id"):
				allele = "{0}/{1}".format(r["ref"], r["alt"])
				tsv.write_line(
					f, r["chr"], r["strand"], r["start"], allele,
					r["gene_id"], r["impact"], TransFIC.class_name(r["impact"]),
					r["sample_freq"], r["sample_prop"], r["prot_changes"], r["xrefs"], null_value="-")

		log.info("  Genes ...")

		with tsv.open(os.path.join(base_path, "gene-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
			_write_recurrence_params(f, *group_params)
			tsv.write_line(f, "GENE_ID", "SAMPLE_FREQ", "SAMPLE_PROP")
			for r in c.execute("SELECT * FROM genes ORDER BY gene_id"):
				tsv.write_line(f, r["gene_id"], r["sample_freq"], r["sample_prop"], null_value="-")

		log.info("  Pathways ...")

		with tsv.open(os.path.join(base_path, "pathway-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
			_write_recurrence_params(f, *group_params)
			tsv.write_line(f, "PATHWAY_ID", "SAMPLE_FREQ", "SAMPLE_PROP")
			for r in c.execute("SELECT * FROM pathways ORDER BY pathway_id"):
				tsv.write_line(f, r["pathway_id"], r["sample_freq"], r["sample_prop"], null_value="-")
	finally:
		# Always release the sqlite connection, also when a step above fails
		conn.close()

	# Only reached on success, as before: the temporary database is kept when
	# the combination fails
	remove_temp(task, db_path)
Пример #3
0
def _run_oncodrivefm_combine(log, method, output_path, output_name, defines, input_files, error_message):
	"""Run oncodrivefm-combine on input_files and raise when it fails.

	The command is passed to subprocess as an argument list (no shell), so
	names containing quotes or other shell metacharacters reach the program
	unchanged.

	:param log: logger used to trace the command.
	:param method: value for the -m option.
	:param output_path: value for the -o option.
	:param output_name: value for the -n option.
	:param defines: sequence of (key, value) pairs, each passed as -D key=value.
	:param input_files: paths of the files to combine.
	:param error_message: text used to start the exception message.
	:raises Exception: when the program can not be started or exits with a
		non zero code.
	"""
	cmd = ["oncodrivefm-combine", "-m", method, "-o", output_path, "-n", output_name]
	for key, value in defines:
		cmd += ["-D", "{0}={1}".format(key, value)]
	cmd += ["--output-format", "tsv.gz"]
	cmd += input_files

	# Flat rendering used for logging and error reporting only
	cmd_text = " ".join(cmd)

	log.debug(cmd_text)

	try:
		ret_code = subprocess.call(cmd)
	except OSError as ex:
		# Without a shell a missing executable raises instead of returning 127
		raise Exception("{0} ({1}):\n{2}".format(error_message, ex, cmd_text))

	if ret_code != 0:
		raise Exception("{0}:\n{1}".format(error_message, cmd_text))


def combination_oncodrivefm(projects_set):
	"""Combine the OncodriveFM results of the projects of one classifier group.

	The gene p-values and pathway z-scores of every project are exported to
	TSV files in a temporary folder and combined with the external
	"oncodrivefm-combine" program into "gene-<prefix>" and "pathway-<prefix>"
	results in the "oncodrivefm" combination path.

	:param projects_set: a (classifier, projects) pair. Keys read from
		classifier: "id", "name", "group_values", "group_name",
		"group_short_name" and "group_long_name". Keys read from each
		project: "id" and "db".
	:raises Exception: when oncodrivefm-combine fails.
	"""
	log = task.logger
	conf = task.conf

	classifier, projects = projects_set

	classifier_id = classifier["id"]

	group_values = classifier["group_values"]

	group_name = classifier["group_name"]
	group_short_name = classifier["group_short_name"]
	group_long_name = classifier["group_long_name"]

	if len(group_values) == 0:
		group_file_prefix = classifier_id
	else:
		group_file_prefix = "{0}-{1}".format(classifier_id, group_short_name)

	group_file_prefix = normalize_id(group_file_prefix)

	log.info("--- [{0} ({1}) ({2}) ({3})] {4}".format(classifier["name"], group_long_name, group_short_name, group_name, "-" * 30))

	log.info("Exporting project data ...")

	base_path = make_temp_dir(task, suffix=".{0}".format(group_file_prefix))

	log.debug("> {0}".format(base_path))

	project_ids = []
	gene_files = []
	pathway_files = []
	for project in projects:
		project_id = project["id"]
		project_ids.append(project_id)

		log.info("  Project {0}:".format(project_id))

		projdb = ProjectDb(project["db"])
		try:
			log.info("    Genes ...")

			count = 0
			file_path = os.path.join(base_path, "{0}-genes.tsv".format(project_id))
			gene_files.append(file_path)
			with open(file_path, "w") as f:
				tsv.write_param(f, "classifier", classifier_id)
				tsv.write_param(f, "group_id", group_name)
				tsv.write_param(f, "slice", project_id)
				tsv.write_line(f, "GENE_ID", "PVALUE")
				for gene in projdb.genes():
					if gene.fm_pvalue is not None:
						tsv.write_line(f, gene.id, gene.fm_pvalue, null_value="-")
						count += 1

			log.info("      {0} genes".format(count))

			log.info("    Pathways ...")

			count = 0
			file_path = os.path.join(base_path, "{0}-pathways.tsv".format(project_id))
			pathway_files.append(file_path)
			with open(file_path, "w") as f:
				tsv.write_param(f, "classifier", classifier_id)
				tsv.write_param(f, "group_id", group_name)
				tsv.write_param(f, "slice", project_id)
				tsv.write_line(f, "PATHWAY_ID", "ZSCORE")
				for pathway in projdb.pathways():
					if pathway.fm_zscore is not None:
						tsv.write_line(f, pathway.id, pathway.fm_zscore, null_value="-")
						count += 1

			log.info("      {0} pathways".format(count))
		finally:
			# Release the project database even when the export fails
			projdb.close()

	log.info("Combining ...")

	combination_path = get_combination_path(conf, "oncodrivefm")

	defines = [
		("classifier", classifier_id),
		("group_id", group_name),
		("group_short_name", group_short_name),
		("group_long_name", group_long_name),
	]

	log.info("  Genes ...")

	_run_oncodrivefm_combine(
		log, "median-empirical", combination_path,
		"gene-{0}".format(group_file_prefix), defines, gene_files,
		"OncodriveFM error while combining gene pvalues")

	log.info("  Pathways ...")

	_run_oncodrivefm_combine(
		log, "median-zscore", combination_path,
		"pathway-{0}".format(group_file_prefix), defines, pathway_files,
		"OncodriveFM error while combining pathway zscores")

	# Only reached on success, as before: the exported files are kept when a
	# combination fails
	remove_temp(task, base_path)
Пример #4
0
def _significant_genes(base_path, filename_re, threshold_for, gene_sites):
	"""Collect the genes below a q-value threshold from per cancer site results.

	Every file in base_path whose name matches filename_re is read; group 1
	of the match is taken as the cancer site code and the "group_long_name"
	parameter of the file as the cancer site name.

	:param base_path: folder containing the result files.
	:param filename_re: compiled pattern selecting the files to read.
	:param threshold_for: callable returning the q-value threshold to use for
		a cancer site code.
	:param gene_sites: structure updated through add_cancer_site() for every
		significant gene.
	:return: set with the ids of the significant genes.
	"""
	significant = set()
	for filename in os.listdir(base_path):
		m = filename_re.match(filename)
		if not m:
			continue

		cancer_site_code = m.group(1)
		threshold = threshold_for(cancer_site_code)

		with tsv.open(os.path.join(base_path, filename), "r") as f:
			params = tsv.params(f)
			cancer_site_name = params["group_long_name"]
			for gene, qvalue in tsv.lines(f, (str, float), columns=("ID", "QVALUE"), header=True):
				# A missing q-value must not count as significant: None compares
				# as smaller than any number on Python 2 and raises on Python 3
				if qvalue is not None and qvalue < threshold:
					add_cancer_site(gene_sites, gene, cancer_site_code, cancer_site_name)
					significant.add(gene)

	return significant


def drivers():
	"""Build the drivers database and the driver genes cancer sites dataset.

	Variants are loaded from the global recurrences file, and genes that are
	significant in the per cancer site OncodriveFM (q-value < 0.01) or
	OncodriveCLUST (q-value < 0.05) combinations are registered in
	"drivers.db". The cancer sites of every such gene are saved to
	"gene-driver_cancer_sites.tsv" in the results path.
	"""
	log = task.logger
	conf = task.conf

	db_path = get_results_path(conf, "drivers.db")
	db = SigDb(db_path)
	db.open()
	try:
		log.info("Variants ...")

		path = get_combination_path(conf, "recurrences", "variant_gene-global-all.tsv.gz")
		with tsv.open(path, "r") as f:
			types = (str, str, int, str)
			for fields in tsv.lines(f, types, columns=("CHR", "STRAND", "START", "ALLELE"), header=True):
				# Only the chromosome and the start position are stored
				chrom, _strand, start, _allele = fields[:4]
				db.add_variant(chrom, start)

		log.info("Genes ...")

		gene_sites = {}

		# Cancer site codes using the stricter 1e-6 OncodriveFM threshold,
		# e.g. ["C18", "C34"]; currently none
		SPECIAL_THRESHOLD = []

		log.info("  OncodriveFM ...")

		# Dots are escaped and the end is anchored so that only names ending
		# exactly in ".tsv.gz" are accepted
		gene_fm = _significant_genes(
			get_combination_path(conf, "oncodrivefm"),
			re.compile(r"gene-cancer_site-(.+)\.tsv\.gz$"),
			lambda code: 1e-6 if code in SPECIAL_THRESHOLD else 0.01,
			gene_sites)

		log.info("  OncodriveCLUST ...")

		gene_clust = _significant_genes(
			get_combination_path(conf, "oncodriveclust"),
			re.compile(r"cancer_site-(.+)\.tsv\.gz$"),
			lambda code: 0.05,
			gene_sites)

		log.info("  Updating db ...")
		sig_genes = gene_fm | gene_clust
		for gene in sig_genes:
			db.add_gene(gene, gene in gene_fm, gene in gene_clust)

		log.info("Saving driver genes cancer sites dataset ...")
		path = get_results_path(conf, "gene-driver_cancer_sites.tsv")
		log.debug("> {}".format(path))
		with open(path, "w") as f:
			tsv.write_param(f, "date", datetime.now())
			tsv.write_line(f, "GENE_ID", "FM", "CLUST", "CANCER_SITES_COUNT", "CANCER_SITE_CODES", "CANCER_SITE_NAMES")
			for gene, sites in gene_sites.items():
				tsv.write_line(
					f, gene,
					1 if gene in gene_fm else 0,
					1 if gene in gene_clust else 0,
					len(sites),
					", ".join(sorted([code for code, name in sites])),
					", ".join(sorted([name for code, name in sites])))

		db.commit()
	finally:
		# Always release the database, also when a step above fails
		db.close()