Пример #1
0
def ma_run(partition):
	"""
	Query Mutation Assessor for the non synonymous variants of a partition.

	The VEP output at partition["vep_path"] is scanned (tab separated; column 0 is the integer
	variant id and column 3 a comma separated list of consequence types). Every variant having
	a consequence matching so.NON_SYNONYMOUS is looked up in the project database and sent to
	Mutation Assessor (local cache or web service depending on conf["offline"]). One line
	(variant id, uniprot, functional impact score) is written per variant with a result.

	The partition, extended with the "ma_path" key, is finally sent through the "results" port.

	:param partition: dict with at least the keys "project", "index", "base_path" and "vep_path"
	"""
	log = task.logger
	conf = task.conf

	results_port = task.ports("results")

	project = partition["project"]

	log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))

	offline = conf["offline"]

	if offline == "yes":
		log.info("Running Mutation assessor in local mode.")

		ma = MaLocal(conf["ma_cache_path"])
	else:
		log.info("Running Mutation assessor using web services.")

		ma = MaService(project["assembly"], cache_path=os.path.join(conf["cache_path"], "ma.db"))

	# From here on 'ma' must be released whatever happens
	try:
		results_path = os.path.join(partition["base_path"], "{0:08d}.ma".format(partition["index"]))

		# NOTE: "consequences_overwrite" defaults to True, so results are recalculated unless configured otherwise
		if not os.path.exists(results_path) or conf.get("consequences_overwrite", True):

			log.info("Querying Mutation assessor for 'missense_variant' consequences ...")

			projdb = ProjectDb(project["db"])
			try:
				# A set is used because the same variant can appear in several VEP lines
				missense_variants = set()

				with open(partition["vep_path"], "r") as f:
					for line in f:
						fields = line.rstrip().split("\t")
						var_id = int(fields[0])
						ctypes = fields[3].split(",")
						if so.match(ctypes, so.NON_SYNONYMOUS):
							missense_variants.add(var_id)

				with open(results_path, "w") as mf:
					for var_id in missense_variants:
						var = projdb.get_variant(var_id)

						start, end, ref, alt = var_to_tab(var)

						r = ma.get(var.chr, var.strand, start, ref, alt, var_id)
						if r is not None:
							tsv.write_line(mf, var_id, r.uniprot, r.fi_score, null_value="-")
			finally:
				projdb.close()

		else:
			log.warn("Skipping MA, results already exist.")
			log.debug("MA results: {0}".format(results_path))
	finally:
		ma.close()

	# Send results to the next module
	partition["ma_path"] = results_path
	results_port.send(partition)
Пример #2
0
def datasets(projects_set):
	"""
	Append a summary line for a group of projects to its combination datasets file.

	The number of affected samples of every project is read from its database and added up.
	A line with the group names, the total number of samples and the comma separated project
	ids is then appended to <combination path>/<normalized classifier id>.tsv. The header line
	is written first when the file does not exist yet.

	:param projects_set: pair (classifier, projects) where classifier is a dict with the keys
		"id", "name", "group_name", "group_short_name" and "group_long_name", and projects
		is an iterable of project dicts with the keys "id" and "db"
	"""
	log = task.logger
	conf = task.conf

	classifier, projects = projects_set

	classifier_id = classifier["id"]

	group_name = classifier["group_name"]
	group_short_name = classifier["group_short_name"]
	group_long_name = classifier["group_long_name"]

	group_file_prefix = normalize_id(classifier_id)

	log.info("--- [{0} ({1}) ({2}) ({3})] {4}".format(classifier["name"], group_long_name, group_short_name, group_name, "-" * 30))

	log.info("Reading number of samples per project ...")

	project_ids = []
	total_samples = 0
	for project in projects:
		project_id = project["id"]
		project_ids.append(project_id)

		log.info("  Project {0}".format(project_id))

		projdb = ProjectDb(project["db"])
		try:
			num_samples = projdb.get_total_affected_samples()
		finally:
			# Release the database even if the query fails
			projdb.close()

		total_samples += num_samples

		log.debug("    {0} samples".format(num_samples))

	log.debug("  {0} samples in total".format(total_samples))

	log.info("Updating ...")

	combination_path = get_combination_path(conf)

	path = os.path.join(combination_path, "{0}.tsv".format(group_file_prefix))

	# The header is only needed the first time the file is created
	write_header = not os.path.exists(path)

	with open(path, "a") as f:
		if write_header:
			tsv.write_line(f, "NAME", "SHORT_NAME", "LONG_NAME", "SAMPLES_TOTAL", "PROJECT_IDS")

		tsv.write_line(f, group_name, group_short_name, group_long_name, total_samples, ",".join(project_ids))
Пример #3
0
def variants(project):
	"""
	Save the variant counts of a project as its "variants" quality control data.

	The counts come from ProjectDb.count_variants() and are stored through
	ProjectResults.save_quality_control().

	:param project: project dict with at least the keys "id" and "db"
	"""
	log = task.logger

	log.info("--- [{0}] --------------------------------------------".format(project["id"]))

	log.info("Calculating number of variants processed in each step ...")

	proj_res = ProjectResults(project)

	projdb = ProjectDb(project["db"])
	try:
		counts = projdb.count_variants()

		proj_res.save_quality_control("variants", counts)
	finally:
		# Release the database even if counting or saving fails
		projdb.close()
Пример #4
0
def update_db(project):
	"""
	Load the OncodriveCLUST results of a project into its database.

	The "oncodriveclust" entry is removed from the project dict. When its "results" file does
	not exist the project is skipped and NOT forwarded. Otherwise the excluded gene causes
	(<temp_path>/oncodriveclust-excluded-cause.tsv) and the results file are read, the genes
	are updated accordingly, the changes are committed and the project is sent through the
	"projects_out" port.

	:param project: project dict with at least the keys "id", "db", "temp_path" and "oncodriveclust"
	"""
	log = task.logger

	projects_out_port = task.ports("projects_out")

	project_id = project["id"]
	log.info("--- [{0}] --------------------------------------------".format(project_id))

	oclust = project["oncodriveclust"]
	del project["oncodriveclust"]

	if not os.path.exists(oclust["results"]):
		log.warn("No results have been found. Skipping it.")
		return

	log.info("Updating the project database ...")

	projdb = ProjectDb(project["db"])
	try:
		exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")

		log.info("  Excluded gene causes ...")
		log.debug("    > {0}".format(exc_path))

		count = 0
		with tsv.open(exc_path, "r") as exf:
			for gene, cause in tsv.lines(exf, (str, str), header=True):
				projdb.update_gene(Gene(id=gene, clust_exc_cause=cause))
				count += 1

		log.debug("    {0} genes excluded".format(count))

		log.info("  OncodriveCLUST results ...")

		with tsv.open(oclust["results"], "r") as f:
			types = (str, str, float, float, float)
			columns = ("GENE", "CLUST_COORDS", "ZSCORE", "PVALUE", "QVALUE")
			for gene, coords, zscore, pvalue, qvalue in tsv.lines(f, types, columns=columns, header=True, null_value="NA"):
				# Genes having results override any excluded cause set above
				projdb.update_gene(Gene(id=gene, clust_coords=coords, clust_zscore=zscore, clust_pvalue=pvalue,
										clust_qvalue=qvalue, clust_exc_cause=ProjectDb.NO_GENE_EXC))

		# Only reached when everything was loaded without errors
		projdb.commit()
	finally:
		projdb.close()

	projects_out_port.send(project)
Пример #5
0
def oncodriveclust(project):
    """
    Compute and save the "oncodriveclust" quality control data of a project.

    Genes and samples are numbered in order of first appearance while scanning the project
    consequences. Four successive stages are tracked for both genes and samples:

    - source: everything found in the consequences
    - selected: consequences matching so.PROTEIN_AFFECTING
    - filter: selected genes accepted by the genes filter
    - threshold: filtered genes whose number of selected samples reaches the samples threshold

    The number of threshold genes below a series of q-value cut-offs is also counted. The
    resulting dict is stored through ProjectResults.save_quality_control().

    :param project: project dict with at least the keys "id" and "db"
    """
    log = task.logger
    conf = task.conf

    log.info("--- [{0}] --------------------------------------------".format(project["id"]))

    source_genes = {}  # gene id -> gene index
    syn_genes = set()
    selected_genes = set()
    filter_genes = set()
    threshold_genes = set()

    source_samples = {}  # sample name -> sample index
    selected_samples = set()
    filter_samples = set()
    threshold_samples = set()

    selected_gene_sample_count = {}  # number of samples for each selected gene
    filter_gene_sample_count = {}  # number of samples per each gene passing the filter

    # get configuration

    samples_threshold, genes_filter_enabled, genes_filter, filt = get_oncodriveclust_configuration(log, conf, project)

    log.info("Retrieving gene alterations ...")

    data = set()  # distinct (gene id, sample index) pairs having a selected consequence

    projdb = ProjectDb(project["db"])
    try:
        for csq in projdb.consequences(join_samples=True):
            is_selected = so.match(csq.ctypes, so.PROTEIN_AFFECTING)
            is_synonymous = so.match(csq.ctypes, so.SYNONYMOUS)

            # The index has to be looked up on every iteration: assigning it only for new genes
            # would reuse the index of the previous consequence for already known genes.
            gene_index = source_genes.setdefault(csq.gene, len(source_genes))

            if is_selected:
                selected_genes.add(gene_index)

            if is_synonymous:
                syn_genes.add(gene_index)

            for sample in csq.var.samples:
                # Same as with genes, always resolve the index of the current sample
                sample_index = source_samples.setdefault(sample.name, len(source_samples))

                if is_selected:
                    selected_samples.add(sample_index)
                    data.add((csq.gene, sample_index))
    finally:
        projdb.close()

    log.info("Counting selected, filtered and threshold ...")

    # calculate selected and filter counts

    data2 = set()  # (gene index, sample index) pairs passing the genes filter

    for gene, sample_index in data:
        gene_index = source_genes[gene]
        selected_gene_sample_count[gene_index] = selected_gene_sample_count.get(gene_index, 0) + 1

        if filt.valid(gene):
            data2.add((gene_index, sample_index))
            filter_genes.add(gene_index)
            filter_samples.add(sample_index)
            filter_gene_sample_count[gene_index] = filter_gene_sample_count.get(gene_index, 0) + 1

    # calculate threshold counts

    for gene_index, sample_index in data2:
        if selected_gene_sample_count[gene_index] >= samples_threshold:
            threshold_genes.add(gene_index)
            threshold_samples.add(sample_index)

    log.info("Counting significant genes ...")

    # significance of q-values

    sig_thresholds = [0.0, 0.001, 0.005] + [i / 100.0 for i in range(1, 11)] + [1.0]
    sig_count = [0] * len(sig_thresholds)

    projdb = ProjectDb(project["db"])
    try:
        for gene in projdb.genes():
            if gene.id in source_genes and source_genes[gene.id] in threshold_genes:
                # NOTE(review): the OncodriveFM q-value is used here although this is the
                # OncodriveCLUST report; confirm whether gene.clust_qvalue was intended.
                i = 0
                while i < len(sig_thresholds) and gene.fm_qvalue > sig_thresholds[i]:
                    i += 1

                # The counts are cumulative: a gene counts for every cut-off >= its q-value
                for j in range(i, len(sig_count)):
                    sig_count[j] += 1
    finally:
        projdb.close()

    source_genes_count = len(source_genes)
    syn_genes_count = len(syn_genes)
    selected_genes_count = len(selected_genes)
    filter_genes_count = len(filter_genes)
    threshold_genes_count = len(threshold_genes)

    source_samples_count = len(source_samples)
    selected_samples_count = len(selected_samples)
    filter_samples_count = len(filter_samples)
    threshold_samples_count = len(threshold_samples)

    # Filtered genes sorted from more to less samples
    sorted_filter_genes = sorted(filter_genes, reverse=True, key=lambda gi: filter_gene_sample_count[gi])

    qc_data = dict(
        source=dict(
            genes=sorted(source_genes.keys(), key=lambda k: source_genes[k]),
            genes_count=source_genes_count,
            genes_lost_count=max(0, source_genes_count - syn_genes_count - threshold_genes_count),
            samples=sorted(source_samples.keys(), key=lambda k: source_samples[k]),
            samples_count=source_samples_count,
        ),
        # NOTE(review): this key lives at the top level and not inside "source";
        # kept as it is because consumers of the saved data are not visible from here.
        samples_lost_count=max(0, source_samples_count - threshold_samples_count),
        synonymous=dict(
            genes=sorted(syn_genes),
            genes_count=syn_genes_count,
            ratio=(float(syn_genes_count) / selected_genes_count) if selected_genes_count > 0 else 0,
        ),
        selected=dict(
            genes=sorted(selected_genes),
            genes_count=selected_genes_count,
            genes_lost=sorted(set(source_genes.values()) - syn_genes - selected_genes),
            genes_lost_count=max(0, source_genes_count - syn_genes_count - selected_genes_count),
            samples=sorted(selected_samples),
            samples_count=selected_samples_count,
            samples_lost=sorted(set(source_samples.values()) - selected_samples),
            samples_lost_count=max(0, source_samples_count - selected_samples_count),
        ),
        filter=dict(
            genes=sorted_filter_genes,
            genes_count=filter_genes_count,
            genes_lost=sorted(selected_genes - filter_genes),
            genes_lost_count=max(0, selected_genes_count - filter_genes_count),
            genes_sample_count=[filter_gene_sample_count[gene_index] for gene_index in sorted_filter_genes],
            samples=sorted(filter_samples),
            samples_count=filter_samples_count,
            samples_lost=sorted(selected_samples - filter_samples),
            samples_lost_count=max(0, selected_samples_count - filter_samples_count),
        ),
        threshold=dict(
            genes=sorted(threshold_genes),
            genes_count=threshold_genes_count,
            genes_lost=sorted(filter_genes - threshold_genes),
            genes_lost_count=max(0, filter_genes_count - threshold_genes_count),
            samples=sorted(threshold_samples),
            samples_count=threshold_samples_count,
            samples_threshold=samples_threshold,
            samples_lost=sorted(filter_samples - threshold_samples),
            samples_lost_count=max(0, filter_samples_count - threshold_samples_count),
        ),
        # The 0.0 cut-off is only a sentinel for the search above and is not reported
        results=dict(sig_thresholds=sig_thresholds[1:], sig_count=sig_count[1:]),
    )

    project_results = ProjectResults(project)
    project_results.save_quality_control("oncodriveclust", qc_data)
Пример #6
0
def quality_control(log, conf, project, filt):
	"""
	Build the OncodriveFM quality control summary of a project.

	Every (sample, gene) pair found in the project consequences is flagged as selected when at
	least one of its consequences matches so.ONCODRIVEFM. Samples and genes are then numbered
	and classified into the successive stages source, selected, filter (accepted by filt, or
	all of them when filt is None) and threshold (genes with at least the configured number
	of selected samples). The number of threshold genes below a series of q-value cut-offs is
	counted as well.

	:param log: logger handed over to get_threshold()
	:param conf: configuration handed over to get_threshold()
	:param project: project dict with at least the key "db"
	:param filt: genes filter exposing valid(gene), or None to accept every gene
	:return: dict with the keys "source", "samples_lost_count", "selected", "filter",
		"threshold" and "results"
	"""

	# (sample id, gene) -> whether any consequence of the pair matches so.ONCODRIVEFM
	pair_selected = {}

	projdb = ProjectDb(project["db"])

	for csq in projdb.consequences(join_samples=True, join_ctypes=True):
		matches = so.match(csq.ctypes, so.ONCODRIVEFM)

		for sample in csq.var.samples:
			pair = (sample.id, csq.gene)
			pair_selected[pair] = pair_selected.get(pair, False) or matches

	projdb.close()

	sample_indices = {} # sample id -> sample index
	gene_indices = {} # gene -> gene index

	samples_per_selected_gene = {} # gene -> number of selected samples
	samples_per_filter_gene = {} # gene index -> number of samples passing the filter

	selected_samples, filter_samples, threshold_samples = set(), set(), set()
	selected_genes, filter_genes, threshold_genes = set(), set(), set()

	# First pass: number the samples and count the selected samples of each gene

	for (sample, gene), selected in pair_selected.items():
		sample_index = sample_indices.setdefault(sample, len(sample_indices))

		if selected:
			selected_samples.add(sample_index)

			increment(samples_per_selected_gene, gene)

	samples_threshold = get_threshold(
		log, conf, project,
		"oncodrivefm.genes.threshold", ONCODRIVEFM_GENES_THRESHOLD, len(selected_samples))

	# Second pass: number the genes and classify the samples

	for (sample, gene), selected in pair_selected.items():
		gene_index = gene_indices.setdefault(gene, len(gene_indices))

		if not selected:
			continue

		if filt is not None and not filt.valid(gene):
			continue

		sample_index = sample_indices[sample]

		filter_samples.add(sample_index)

		increment(samples_per_filter_gene, gene_index)

		if samples_per_selected_gene[gene] >= samples_threshold:
			threshold_samples.add(sample_index)

	# Third pass: classify the genes

	for gene, num_samples in samples_per_selected_gene.items():
		gene_index = gene_indices[gene]

		selected_genes.add(gene_index)

		if filt is not None and not filt.valid(gene):
			continue

		filter_genes.add(gene_index)

		if num_samples >= samples_threshold:
			threshold_genes.add(gene_index)

	# significance of q-values

	projdb = ProjectDb(project["db"])

	sig_thresholds = [0.0, 0.001, 0.005]
	sig_thresholds += [i / 100.0 for i in range(1, 11)]
	sig_thresholds += [1.0]

	sig_count = [0 for _ in sig_thresholds]

	for gene in projdb.genes():
		if gene.id not in gene_indices:
			continue

		if gene_indices[gene.id] not in threshold_genes:
			continue

		# Position of the first cut-off not exceeded by the q-value
		first = 0
		for cutoff in sig_thresholds:
			if not (gene.fm_qvalue > cutoff):
				break
			first += 1

		# Cumulative: the gene counts for that cut-off and all the following ones
		for pos in range(first, len(sig_count)):
			sig_count[pos] += 1

	projdb.close()

	num_source_samples = len(sample_indices)
	num_selected_samples = len(selected_samples)
	num_filter_samples = len(filter_samples)
	num_threshold_samples = len(threshold_samples)

	num_source_genes = len(gene_indices)
	num_selected_genes = len(selected_genes)
	num_filter_genes = len(filter_genes)
	num_threshold_genes = len(threshold_genes)

	all_gene_indices = set(gene_indices.values())
	all_sample_indices = set(sample_indices.values())

	# Filtered genes from more to less samples
	filter_genes_by_samples = sorted(filter_genes, reverse=True, key=lambda index: samples_per_filter_gene[index])

	source_section = {
		"genes": sorted(gene_indices.keys(), key=lambda name: gene_indices[name]),
		"genes_count": num_source_genes,
		"genes_lost_count": max(0, num_source_genes - num_threshold_genes),
		"samples": sorted(sample_indices.keys(), key=lambda name: sample_indices[name]),
		"samples_count": num_source_samples,
	}

	selected_section = {
		"genes": sorted(selected_genes),
		"genes_count": num_selected_genes,
		"genes_lost": sorted(all_gene_indices - selected_genes),
		"genes_lost_count": max(0, num_source_genes - num_selected_genes),
		"samples": sorted(selected_samples),
		"samples_count": num_selected_samples,
		"samples_lost": sorted(all_sample_indices - selected_samples),
		"samples_lost_count": max(0, num_source_samples - num_selected_samples),
	}

	filter_section = {
		"genes": filter_genes_by_samples,
		"genes_count": num_filter_genes,
		"genes_lost": sorted(selected_genes - filter_genes),
		"genes_lost_count": max(0, num_selected_genes - num_filter_genes),
		"genes_sample_count": [samples_per_filter_gene[index] for index in filter_genes_by_samples],
		"samples": sorted(filter_samples),
		"samples_count": num_filter_samples,
		"samples_lost": sorted(selected_samples - filter_samples),
		"samples_lost_count": max(0, num_selected_samples - num_filter_samples),
	}

	threshold_section = {
		"genes": sorted(threshold_genes),
		"genes_count": num_threshold_genes,
		"genes_lost": sorted(filter_genes - threshold_genes),
		"genes_lost_count": max(0, num_filter_genes - num_threshold_genes),
		"samples": sorted(threshold_samples),
		"samples_count": num_threshold_samples,
		"samples_threshold": samples_threshold,
		"samples_lost": sorted(filter_samples - threshold_samples),
		"samples_lost_count": max(0, num_filter_samples - num_threshold_samples),
	}

	# The leading 0.0 cut-off is not reported
	results_section = {
		"sig_thresholds": sig_thresholds[1:],
		"sig_count": sig_count[1:],
	}

	return {
		"source": source_section,
		# This counter is a top level entry, it is not part of the "source" section
		"samples_lost_count": max(0, num_source_samples - num_threshold_samples),
		"selected": selected_section,
		"filter": filter_section,
		"threshold": threshold_section,
		"results": results_section,
	}
Пример #7
0
def pack_results(project):
	"""
	Export the results of a project into <project path>/results.zip.

	The archive gets one tab separated entry for each of: variant genes, variant samples,
	consequences, genes, pathways, the per sample and gene functional impact (unless the
	"skip_oncodrivefm" configuration flag is set) and the project configuration.

	:param project: project dict with at least the keys "id", "path", "db", "assembly" and "annotations"
	"""
	log = task.logger
	conf = task.conf

	project_id = project["id"]

	log.info("--- [{0}] --------------------------------------------".format(project_id))

	dest_path = os.path.join(project["path"], "results.zip")

	arc = None
	projdb = None

	sigdb = SigDb(conf["sigdb_path"])
	sigdb.open()

	# Everything acquired from here on is released in the 'finally' clause,
	# including the databases when the initial queries fail
	try:
		projdb = ProjectDb(project["db"])

		projres = ProjectResults(project)

		gene_sym = projdb.get_gene_symbols()

		total_samples = projdb.get_total_affected_samples()

		log.info("Compressing files ...")

		arc = Archive(dest_path, mode="w", fmt="zip")

		log.info("  Variant genes ...")

		with ArcFile(task, arc, project_id, "variant_genes", "w") as vf:
			write_line(vf, "PROJECT_ID", "CHR", "STRAND", "START", "ALLELE",
							"GENE_ID", "SYMBOL", "VAR_IMPACT", "VAR_IMPACT_DESC",
							"SAMPLE_FREQ", "SAMPLE_TOTAL", "SAMPLE_PROP",
							"CODING_REGION", "PROTEIN_CHANGES", "INTOGEN_DRIVER", "XREFS")

			for afg in projdb.affected_genes(join_variant=True, join_xrefs=True, join_rec=True):
				var = afg.var
				rec = afg.rec

				start, end, ref, alt = var_to_tab(var)

				# Variants known by the signatures database get an extra cross reference
				xrefs = list(var.xrefs)
				if sigdb.exists_variant(var.chr, start):
					xrefs.append("I:1")
				xrefs = ",".join(xrefs)

				intogen_driver = 1 if sigdb.exists_gene(afg.gene_id) else 0

				write_line(vf, project_id, var.chr, var.strand, start, "{0}/{1}".format(ref, alt),
								afg.gene_id, gene_sym.get(afg.gene_id),
								afg.impact, TransFIC.class_name(afg.impact),
								rec.sample_freq or 0, total_samples, rec.sample_prop or 0,
								afg.coding_region, afg.prot_changes, intogen_driver, xrefs)

		log.info("  Variant samples ...")

		with ArcFile(task, arc, project_id, "variant_samples", "w") as vf:
			write_line(vf, "PROJECT_ID", "CHR", "STRAND", "START", "ALLELE", "SAMPLES")

			for var in projdb.variants(join_samples=True):
				start, end, ref, alt = var_to_tab(var)
				write_line(vf, project_id, var.chr, var.strand, start, "{0}/{1}".format(ref, alt),
						   ",".join([s.name for s in var.samples]))

		log.info("  Consequences ...")

		with ArcFile(task, arc, project_id, "consequences", "w") as cf:
			write_line(cf, "PROJECT_ID", "CHR", "STRAND", "START", "ALLELE", "TRANSCRIPT_ID", "CT",
							"GENE_ID", "SYMBOL", "UNIPROT_ID", "PROTEIN_ID", "PROTEIN_POS", "AA_CHANGE",
							"SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
							"PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
							"MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS",
							"IMPACT", "IMPACT_CLASS")

			for csq in projdb.consequences(join_variant=True):
				var = csq.var
				start, end, ref, alt = var_to_tab(var)
				allele = "{0}/{1}".format(ref, alt)

				# Protein and score columns stay empty unless the consequence types allow them
				uniprot = protein = protein_pos = aa_change = None
				sift_score = sift_tfic = sift_tfic_class = None
				pph2_score = pph2_tfic = pph2_tfic_class = None
				ma_score = ma_tfic = ma_tfic_class = None

				if so.match(csq.ctypes, so.ONCODRIVEFM):
					uniprot, protein = csq.uniprot, csq.protein

				if so.match(csq.ctypes, so.NON_SYNONYMOUS):
					protein_pos, aa_change = csq.protein_pos, csq.aa_change
					sift_score, sift_tfic, sift_tfic_class = csq.sift_score, csq.sift_tfic, TransFIC.class_name(csq.sift_tfic_class)
					pph2_score, pph2_tfic, pph2_tfic_class = csq.pph2_score, csq.pph2_tfic, TransFIC.class_name(csq.pph2_tfic_class)
					ma_score, ma_tfic, ma_tfic_class = csq.ma_score, csq.ma_tfic, TransFIC.class_name(csq.ma_tfic_class)

				write_line(cf, project_id, var.chr, var.strand, start, allele, csq.transcript,
							",".join(csq.ctypes), csq.gene, gene_sym.get(csq.gene),
							uniprot, protein, protein_pos, aa_change,
							sift_score, sift_tfic, sift_tfic_class,
							pph2_score, pph2_tfic, pph2_tfic_class,
							ma_score, ma_tfic, ma_tfic_class,
							csq.impact, TransFIC.class_name(csq.impact))

		log.info("  Genes ...")

		with ArcFile(task, arc, project_id, "genes", "w") as gf:
			write_line(gf, "PROJECT_ID", "GENE_ID", "SYMBOL", "FM_PVALUE", "FM_QVALUE", "FM_EXC_CAUSE",
							"SAMPLE_FREQ", "SAMPLE_TOTAL", "SAMPLE_PROP",
							"CLUST_ZSCORE", "CLUST_PVALUE", "CLUST_QVALUE", "CLUST_EXC_CAUSE", "CLUST_COORDS",
							"INTOGEN_DRIVER", "XREFS")

			for gene in projdb.genes(join_xrefs=True, join_rec=True):
				# Only genes affected in at least one sample are exported
				if gene.rec.sample_freq is not None and gene.rec.sample_freq > 0:
					intogen_driver = 1 if sigdb.exists_gene(gene.id) else 0
					write_line(gf, project_id, gene.id, gene.symbol, gene.fm_pvalue, gene.fm_qvalue, gene.fm_exc_cause,
									gene.rec.sample_freq, total_samples, gene.rec.sample_prop or 0,
									gene.clust_zscore, gene.clust_pvalue, gene.clust_qvalue, gene.clust_exc_cause,
									gene.clust_coords, intogen_driver, ",".join(gene.xrefs))

		log.info("  Pathways ...")

		with ArcFile(task, arc, project_id, "pathways", "w") as pf:
			write_line(pf, "PROJECT_ID", "PATHWAY_ID", "GENE_COUNT", "FM_ZSCORE", "FM_PVALUE", "FM_QVALUE",
							"SAMPLE_FREQ", "SAMPLE_TOTAL", "SAMPLE_PROP", "GENE_FREQ", "GENE_TOTAL", "GENE_PROP")

			for pathway in projdb.pathways(join_rec=True):
				# Only pathways affected in at least one sample are exported
				if pathway.rec.sample_freq is not None and pathway.rec.sample_freq > 0:
					write_line(pf, project_id, pathway.id, pathway.gene_count, pathway.fm_zscore, pathway.fm_pvalue, pathway.fm_qvalue,
									pathway.rec.sample_freq or 0, total_samples, pathway.rec.sample_prop or 0,
									pathway.rec.gene_freq or 0, pathway.gene_count, pathway.rec.gene_prop or 0)

		skip_oncodrivefm = conf.get("skip_oncodrivefm", False, dtype=bool)

		if not skip_oncodrivefm:

			log.info("  Genes per sample functional impact ...")

			with ArcFile(task, arc, project_id, "fimpact.gitools.tdm", "w") as f:
				write_line(f, "SAMPLE", "GENE_ID",
						   "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
						   "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
						   "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS")
				for fields in projdb.sample_gene_fimpacts():
					# NOTE(review): the first field is taken as the gene and the second as the
					# sample; confirm against the order returned by sample_gene_fimpacts()
					(gene, sample,
						 sift_score, sift_tfic, sift_tfic_class,
						 pph2_score, pph2_tfic, pph2_tfic_class,
						 ma_score, ma_tfic, ma_tfic_class) = fields
					write_line(f, sample, gene,
							   sift_score, sift_tfic, TransFIC.class_name(sift_tfic_class),
							   pph2_score, pph2_tfic, TransFIC.class_name(pph2_tfic_class),
							   ma_score, ma_tfic, TransFIC.class_name(ma_tfic_class))

		log.info("Saving project configuration ...")

		with ArcFile(task, arc, project_id, "project", "w") as f:
			names = ["PROJECT_ID", "ASSEMBLY", "SAMPLES_TOTAL"]
			values = [project_id, project["assembly"], total_samples]
			names, values = projres.get_annotations_to_save(conf, project["annotations"], names=names, values=values)
			tsv.write_line(f, *names)
			tsv.write_line(f, *values, null_value="-")
	finally:
		if arc is not None:
			arc.close()
		if projdb is not None:
			projdb.close()
		sigdb.close()
Пример #8
0
def prepare_files(project):
	"""
	Prepare the input files required to run OncodriveCLUST on a project.

	The gene alterations retrieved from the project database are split into two files in
	project["temp_path"], one for non synonymous and one for synonymous data, with one line
	(gene, sample, protein position) per entry. Non synonymous entries of genes rejected by
	the genes filter (when enabled) are left out. A third file lists, for each gene with non
	synonymous data, why it is excluded (filter and/or not enough samples), if that is the case.

	The project, extended with the "oncodriveclust" entry, is finally sent through the
	"projects_out" port.

	:param project: project dict with at least the keys "id", "db" and "temp_path"
	"""
	log = task.logger
	conf = task.conf

	projects_out_port = task.ports("projects_out")

	project_id = project["id"]
	log.info("--- [{0}] --------------------------------------------".format(project_id))

	mutations_threshold, genes_filter_enabled, genes_filter, filt = get_oncodriveclust_configuration(log, conf, project)

	log.info("Loading transcripts CDS length ...")

	cds_len = load_cds_len(conf)

	log.info("Retrieving gene alterations ...")

	projdb = ProjectDb(project["db"])
	try:
		data = retrieve_data(projdb, cds_len)
	finally:
		# Release the database even if the retrieval fails
		projdb.close()

	# Indexed by the NON_SYN and SYN constants
	data_paths = [
		os.path.join(project["temp_path"], "oncodriveclust-non-syn-data.tsv"),
		os.path.join(project["temp_path"], "oncodriveclust-syn-data.tsv")]

	log.info("Saving data ...")
	log.debug("> {0}".format(data_paths[NON_SYN]))
	log.debug("> {0}".format(data_paths[SYN]))

	gene_sample_count = {} # gene -> number of non synonymous entries

	df = []
	try:
		# Opened one by one so already opened files get closed if a later open fails
		for path in data_paths:
			df.append(tsv.open(path, "w"))

		for key, value in data.items():
			findex, gene, sample = key
			transcript, transcript_len, protein_pos = value

			if findex == NON_SYN:
				# Counted before filtering, excluded genes need their count too
				gene_sample_count[gene] = gene_sample_count.get(gene, 0) + 1

				if genes_filter_enabled and not filt.valid(gene):
					continue

			tsv.write_line(df[findex], gene, sample, protein_pos)
	finally:
		for f in df:
			f.close()

	exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")

	log.info("Saving excluded gene causes ...")
	log.debug("> {0}".format(exc_path))

	with tsv.open(exc_path, "w") as exf:
		tsv.write_line(exf, "GENE", "EXCLUDED_CAUSE")
		for gene, sample_count in gene_sample_count.items():
			causes = []
			if genes_filter_enabled and not filt.valid(gene):
				causes.append(ProjectDb.GENE_EXC_FILTER)
			if sample_count < mutations_threshold:
				causes.append(ProjectDb.GENE_EXC_THRESHOLD)
			if causes:
				# The cause codes are concatenated without any separator
				tsv.write_line(exf, gene, "".join(causes))

	log.info("Sending project ...")

	projects_out_port.send(dict(project,
								oncodriveclust=dict(
									data_paths=data_paths,
									mutations_threshold=mutations_threshold,
									genes_filter_enabled=genes_filter_enabled, # not used
									genes_filter=genes_filter))) # not used
Пример #9
0
def projects(project):
	"""
	Calculate the recurrences of a project and forward it to the next module.

	Recurrences are always computed for the affected genes and, unless the "variants_only"
	configuration flag is set, for genes and pathways too. Projects without any affected
	sample are skipped and NOT forwarded.

	:param project: project dict with at least the keys "id" and "db"
	"""
	log = task.logger
	conf = task.conf

	projects_out_port = task.ports("projects_out")

	log.info("--- [{0}] --------------------------------------------".format(project["id"]))

	projdb = ProjectDb(project["db"])
	try:
		total_samples = projdb.get_total_affected_samples()

		if total_samples == 0:
			log.warn("There are no samples, recurrences cannot be calculated.")
			return

		log.info("Calculating project recurrences for variant genes ...")

		projdb.compute_affected_genes_recurrences(total_samples)

		if not conf.get("variants_only", False):

			log.info("Calculating project recurrences for genes ...")

			projdb.compute_gene_recurrences(total_samples)

			log.info("Calculating project recurrences for pathways ...")

			projdb.compute_pathway_recurrences(total_samples)

		# Only reached when every computation succeeded
		projdb.commit()
	finally:
		# Single exit point for the database: early return, success and errors
		projdb.close()

	projects_out_port.send(project)
Пример #10
0
def _oncodrivefm_combine(log, method, output_path, output_name, defines, input_files, error_msg):
	"""
	Run oncodrivefm-combine and raise an Exception if it exits with a non zero code.

	The command is executed from an argument list without a shell, so names and paths
	containing quotes, spaces or other shell metacharacters are passed through verbatim.

	:param log: logger used to trace the command
	:param method: combination method (-m)
	:param output_path: output folder (-o)
	:param output_name: output name (-n)
	:param defines: list of (key, value) pairs, each passed as -D key=value
	:param input_files: list of paths of the files to combine
	:param error_msg: first line of the message of the raised Exception
	"""
	args = ["oncodrivefm-combine", "-m", method, "-o", output_path, "-n", output_name]
	for key, value in defines:
		args += ["-D", "{0}={1}".format(key, value)]
	args += ["--output-format", "tsv.gz"]
	args += input_files

	# Only for logging and error reporting, the command is never parsed by a shell
	cmd = " ".join(args)

	log.debug(cmd)

	ret_code = subprocess.call(args)
	if ret_code != 0:
		raise Exception("{0}:\n{1}".format(error_msg, cmd))

def combination_oncodrivefm(projects_set):
	"""
	Combine the OncodriveFM results of a group of projects.

	The gene p-values and pathway z-scores of every project are exported into a temporary
	folder (skipping the ones without value) and combined with the oncodrivefm-combine
	command, using the "median-empirical" method for genes and "median-zscore" for pathways.
	The temporary folder is removed when both combinations succeed.

	:param projects_set: pair (classifier, projects) where classifier is a dict with the keys
		"id", "name", "group_values", "group_name", "group_short_name" and "group_long_name",
		and projects is an iterable of project dicts with the keys "id" and "db"
	:raises Exception: when oncodrivefm-combine exits with a non zero code
	"""
	log = task.logger
	conf = task.conf

	classifier, projects = projects_set

	classifier_id = classifier["id"]

	group_values = classifier["group_values"]

	group_name = classifier["group_name"]
	group_short_name = classifier["group_short_name"]
	group_long_name = classifier["group_long_name"]

	if len(group_values) == 0:
		group_file_prefix = classifier_id
	else:
		group_file_prefix = "{0}-{1}".format(classifier_id, group_short_name)

	group_file_prefix = normalize_id(group_file_prefix)

	log.info("--- [{0} ({1}) ({2}) ({3})] {4}".format(classifier["name"], group_long_name, group_short_name, group_name, "-" * 30))

	log.info("Exporting project data ...")

	base_path = make_temp_dir(task, suffix=".{0}".format(group_file_prefix))

	log.debug("> {0}".format(base_path))

	project_ids = []
	gene_files = []
	pathway_files = []
	for project in projects:
		project_id = project["id"]
		project_ids.append(project_id)

		log.info("  Project {0}:".format(project_id))

		projdb = ProjectDb(project["db"])
		try:
			log.info("    Genes ...")

			count = 0
			file_path = os.path.join(base_path, "{0}-genes.tsv".format(project_id))
			gene_files.append(file_path)
			with open(file_path, "w") as f:
				tsv.write_param(f, "classifier", classifier_id)
				tsv.write_param(f, "group_id", group_name)
				tsv.write_param(f, "slice", project_id)
				tsv.write_line(f, "GENE_ID", "PVALUE")
				for gene in projdb.genes():
					if gene.fm_pvalue is not None:
						tsv.write_line(f, gene.id, gene.fm_pvalue, null_value="-")
						count += 1

			log.info("      {0} genes".format(count))

			log.info("    Pathways ...")

			count = 0
			file_path = os.path.join(base_path, "{0}-pathways.tsv".format(project_id))
			pathway_files.append(file_path)
			with open(file_path, "w") as f:
				tsv.write_param(f, "classifier", classifier_id)
				tsv.write_param(f, "group_id", group_name)
				tsv.write_param(f, "slice", project_id)
				tsv.write_line(f, "PATHWAY_ID", "ZSCORE")
				for pathway in projdb.pathways():
					if pathway.fm_zscore is not None:
						tsv.write_line(f, pathway.id, pathway.fm_zscore, null_value="-")
						count += 1

			log.info("      {0} pathways".format(count))
		finally:
			# Release the database even if the export fails
			projdb.close()

	log.info("Combining ...")

	combination_path = get_combination_path(conf, "oncodrivefm")

	# Annotations shared by both combinations
	defines = [
		("classifier", classifier_id),
		("group_id", group_name),
		("group_short_name", group_short_name),
		("group_long_name", group_long_name)]

	log.info("  Genes ...")

	_oncodrivefm_combine(log, "median-empirical", combination_path,
						 "gene-{0}".format(group_file_prefix), defines, gene_files,
						 "OncodriveFM error while combining gene pvalues")

	log.info("  Pathways ...")

	_oncodrivefm_combine(log, "median-zscore", combination_path,
						 "pathway-{0}".format(group_file_prefix), defines, pathway_files,
						 "OncodriveFM error while combining pathway zscores")

	# The temporary folder is kept on errors, which allows inspecting the exported files
	remove_temp(task, base_path)
Пример #11
0
def end():
    """
    Load the OncodriveFM results of every project into its database.

    task.context maps each project id to a list of project dicts. For every one of them the
    excluded gene causes (<temp_path>/oncodrivefm-excluded-cause.tsv) and the results listed
    in its "oncodrivefm" entry (pairs of feature and results path, feature being "genes" or
    "pathways") are read and the genes and pathways are updated. The per sample and gene
    functional impact is reloaded from the first project dict of each list only. The
    "oncodrivefm" entry is removed from the project dicts.

    The first project dict of each list is finally sent through the "projects_out" port.
    """
    log = task.logger

    projects_out_port = task.ports("projects_out")

    log.info("Updating the projects database ...")

    for project_id, projects in task.context.items():

        log.info("[{0}]".format(project_id))

        for index, project in enumerate(projects):
            projdb = ProjectDb(project["db"])
            try:
                if index == 0:
                    log.info("  Functional impact ...")

                    projdb.delete_sample_gene_fimpact()

                    with tsv.open(project["sample_gene_fi_data"], "r") as f:
                        types = (int, str, float, float, int, float, float, int, float, float, int)
                        for fields in tsv.lines(f, types, header=True, null_value="-"):
                            projdb.add_sample_gene_fimpact(*fields)

                ofm = project["oncodrivefm"]
                del project["oncodrivefm"]

                exc_path = os.path.join(project["temp_path"], "oncodrivefm-excluded-cause.tsv")

                log.info("  Excluded gene causes ...")
                log.debug("    > {0}".format(exc_path))

                count = 0
                with tsv.open(exc_path, "r") as exf:
                    for gene, cause in tsv.lines(exf, (str, str), header=True):
                        projdb.update_gene(Gene(id=gene, fm_exc_cause=cause))
                        count += 1

                log.debug("    {0} genes excluded".format(count))

                for feature, results_path in ofm:

                    log.info("  {0} ...".format(feature))
                    log.debug("    > {0}".format(results_path))

                    if feature == "genes":
                        with tsv.open(results_path, "r") as f:
                            count = 0
                            for gene, pvalue, qvalue in tsv.lines(f, (str, float, float), header=True):
                                # Genes having results override any excluded cause set above
                                projdb.update_gene(
                                    Gene(id=gene, fm_pvalue=pvalue, fm_qvalue=qvalue, fm_exc_cause=ProjectDb.NO_GENE_EXC)
                                )
                                count += 1
                            log.info("    {0} genes".format(count))
                    elif feature == "pathways":
                        with tsv.open(results_path, "r") as f:
                            count = 0
                            for pathway, zscore, pvalue, qvalue in tsv.lines(f, (str, float, float, float), header=True):
                                projdb.update_pathway(
                                    Pathway(id=pathway, fm_zscore=zscore, fm_pvalue=pvalue, fm_qvalue=qvalue)
                                )
                                count += 1
                            log.info("    {0} pathways".format(count))

                # Only reached when everything was loaded without errors
                projdb.commit()
            finally:
                # Release the database even if reading or updating fails
                projdb.close()

        projects_out_port.send(projects[0])
Пример #12
0
def scan_files(project):
	"""Create the variants database of a project from its input files.

	A fresh database is built in a temporary file, loaded with the genes and
	pathways reference data and then filled with every variant parsed from
	project["files"] (each file is expanded through archived_files()). Once
	everything has been parsed the database is copied to project["db"] and the
	project is sent to the "liftover_projects" port (assembly hg18) or to the
	"projects_out" port (assembly hg19).

	Raises:
		InternalError: if an input path is not absolute.
		Exception: if the assembly is neither hg18 nor hg19, if an input path
			does not exist or is not a regular file, or if a source yields no
			variants at all.
	"""
	log = task.logger
	conf = task.conf

	projects_port, liftover_projects_port = task.ports("projects_out", "liftover_projects")

	project_id = project["id"]
	projdb_path = project["db"]
	assembly = project["assembly"]

	log.info("--- [{0}] --------------------------------------------".format(project_id))

	if assembly == "hg18":
		out_port = liftover_projects_port
	elif assembly == "hg19":
		out_port = projects_port
	else:
		raise Exception("Unexpected assembly: {0}".format(assembly))

	# A database left over from a previous run is discarded, so the project is
	# always rebuilt from its input files instead of being skipped.
	if os.path.exists(projdb_path):
		os.remove(projdb_path)

	log.info("Creating variants database ...")

	# The database is filled in a temporary file and only copied to its final
	# location after a successful commit.
	projdb_tmp_path = make_temp_file(task, suffix=".db")

	log.debug(projdb_tmp_path)

	projdb = ProjectDb(projdb_tmp_path)
	try:
		projdb.create()

		log.info("Loading genes ...")

		projdb.load_genes(get_data_ensembl_genes_path(conf))

		log.info("Loading pathways ...")

		projdb.load_pathways(
			get_data_kegg_def_path(conf),
			get_data_kegg_ensg_map_path(conf))

		log.info("Parsing variants ...")

		for input_path in project["files"]:
			if not os.path.isabs(input_path):
				raise InternalError("Non absolute path found: {0}".format(input_path))

			if not os.path.exists(input_path):
				raise Exception("Input file not found: {0}".format(input_path))

			if not os.path.isfile(input_path):
				raise Exception("Not a file: {0}".format(input_path))

			for container_name, path, name, ext, f in archived_files(input_path):
				fname = os.path.join(path, name + ext)
				if container_name is not None:
					source_name = "{0}:{1}".format(os.path.basename(container_name), fname)
				else:
					source_name = name + ext

				log.info("=> {0} ...".format(source_name))

				# The sample identifier is the file name without its extension
				sample_id = os.path.basename(name)

				# NOTE(review): the extension is matched case-insensitively but
				# passed to the parser factory with its original case -- confirm
				# that create_variants_parser() accepts e.g. "VCF".
				if ext.lower() in _SUPPORTED_EXTENSIONS:
					parser_type = ext[1:]
				else:
					parser_type = "tab"

				parser = create_variants_parser(parser_type, f, source_name, sample_id)

				source_id = projdb.add_source(source_name)

				var_ids = set()
				for var in parser:
					# Store the source lines read so far before the variant
					# that references one of them by line number.
					for line_num, text in parser.read_lines():
						projdb.add_source_line(source_id, line_num, text)

					var_id = projdb.add_variant(var, source_id=source_id, line_num=parser.get_line_num())
					var_ids.add(var_id)

				# Lines read after the last variant was produced
				for line_num, text in parser.read_lines():
					projdb.add_source_line(source_id, line_num, text)

				num_variants = len(var_ids)
				log.info("   {0} variants".format(num_variants))

				if num_variants == 0:
					raise Exception("No variants found in source '{}'. "
									"Please check the documentation for the expected input for '{}' format.".format(
									source_name, parser.name))

		projdb.commit()
	finally:
		# Release the database handle even when parsing fails half way
		projdb.close()

	log.info("Copying variants database ...")

	log.debug("{0} -> {1}".format(projdb_tmp_path, projdb_path))

	shutil.copy(projdb_tmp_path, projdb_path)

	remove_temp(task, projdb_tmp_path)

	out_port.send(project)
Пример #13
0
def _recurrences_add_sample_freq(cursor, table, id_column, item_id, sample_freq):
	"""Accumulate sample_freq on the row of `table` identified by item_id.

	The row is inserted when it does not exist yet, otherwise sample_freq is
	added to the stored value. `table` and `id_column` are fixed identifiers
	supplied by the caller; only the values are bound as SQL parameters.
	"""
	cursor.execute("SELECT COUNT(*) FROM {0} WHERE {1}=?".format(table, id_column), (item_id,))
	if cursor.fetchone()[0] == 0:
		cursor.execute(
			"INSERT INTO {0} ({1}, sample_freq) VALUES (?,?)".format(table, id_column),
			(item_id, sample_freq))
	else:
		cursor.execute(
			"UPDATE {0} SET sample_freq=sample_freq + ? WHERE {1}=?".format(table, id_column),
			(sample_freq, item_id))


def _recurrences_load_project(log, cursor, project):
	"""Add the recurrences of one project to the combination database.

	Returns the number of affected samples reported by the project database.
	"""
	projdb = ProjectDb(project["db"])
	try:
		project_sample_total = projdb.get_total_affected_samples()

		log.info("    Total samples = {0}".format(project_sample_total))

		log.info("    Variant genes ...")

		count = 0
		for afg in projdb.affected_genes(join_variant=True, join_xrefs=True, join_rec=True):
			var = afg.var
			rec = afg.rec

			if rec.sample_freq is None:
				log.warn("Discarding variant gene without sample frequency: {0}".format(repr(afg)))
				continue

			start, _end, ref, alt = var_to_tab(var)

			# A variant already inserted by a previous project makes the INSERT
			# fail with IntegrityError (presumably through a uniqueness
			# constraint declared by create_db()); its id is then looked up.
			try:
				cursor.execute("INSERT INTO variants (chr, strand, start, ref, alt, xrefs) VALUES (?,?,?,?,?,?)",
					(var.chr, var.strand, start, ref, alt, ",".join(var.xrefs)))
				var_id = cursor.lastrowid
			except sqlite3.IntegrityError:
				cursor.execute("SELECT var_id FROM variants WHERE chr=? AND strand=? AND start=? AND ref=? AND alt=?",
					(var.chr, var.strand, start, ref, alt))
				var_id = cursor.fetchone()[0]

			# Same approach for the (variant, gene) pair: insert it or, when it
			# already exists, accumulate the sample frequency.
			try:
				cursor.execute("INSERT INTO variant_genes (var_id, gene_id, impact, coding_region, prot_changes, sample_freq) VALUES (?,?,?,?,?,?)",
					(var_id, afg.gene_id, afg.impact, afg.coding_region, afg.prot_changes, rec.sample_freq))
			except sqlite3.IntegrityError:
				cursor.execute("""
					UPDATE variant_genes
					SET sample_freq=sample_freq + ?
					WHERE var_id=? AND gene_id=?""",
					(rec.sample_freq, var_id, afg.gene_id))

			count += 1

		log.info("      {0} variant genes".format(count))

		log.info("    Genes ...")

		count = 0
		for gene in projdb.genes(join_xrefs=True, join_rec=True):
			if gene.rec.sample_freq is None:
				continue
			_recurrences_add_sample_freq(cursor, "genes", "gene_id", gene.id, gene.rec.sample_freq)
			count += 1

		log.info("      {0} genes".format(count))

		log.info("    Pathways ...")

		count = 0
		for pathway in projdb.pathways(join_rec=True):
			if pathway.rec.sample_freq is None:
				continue
			_recurrences_add_sample_freq(cursor, "pathways", "pathway_id", pathway.id, pathway.rec.sample_freq)
			count += 1

		log.info("      {0} pathways".format(count))
	finally:
		# Release the project database even if loading fails
		projdb.close()

	return project_sample_total


def _recurrences_write_header(f, params, columns):
	"""Write the parameter lines followed by the column names line."""
	for key, value in params:
		tsv.write_param(f, key, value)
	tsv.write_line(f, *columns)


def combination_recurrences(projects_set):
	"""Combine the recurrences of a group of projects and save them as TSV.

	projects_set is a (classifier, projects) pair. The sample frequencies of
	variant genes, genes and pathways of every project are accumulated in a
	temporary sqlite database, turned into proportions over the total number of
	affected samples and written to three compressed TSV files
	(variant_gene-*, gene-* and pathway-*) under the "recurrences" combination
	path. The temporary database is removed before returning, also on failure.
	"""
	log = task.logger
	conf = task.conf

	classifier, projects = projects_set

	classifier_id = classifier["id"]

	group_values = classifier["group_values"]

	group_name = classifier["group_name"]
	group_short_name = classifier["group_short_name"]
	group_long_name = classifier["group_long_name"]

	if len(group_values) == 0:
		group_file_prefix = classifier_id
	else:
		group_file_prefix = "{0}-{1}".format(classifier_id, group_short_name)

	group_file_prefix = normalize_id(group_file_prefix)

	log.info("--- [{0} ({1}) ({2}) ({3})] {4}".format(classifier["name"], group_long_name, group_short_name, group_name, "-" * 30))

	log.info("Creating database ...")

	db_path = make_temp_file(task, suffix="-{0}.db".format(group_file_prefix))
	log.debug("  > {0}".format(db_path))

	conn = sqlite3.connect(db_path)
	try:
		conn.row_factory = sqlite3.Row

		create_db(conn)

		log.info("Combining recurrences ...")

		c = conn.cursor()

		sample_total = 0

		project_ids = []
		for project in projects:
			project_ids.append(project["id"])

			log.info("  Project {0}:".format(project["id"]))

			sample_total += _recurrences_load_project(log, c, project)

		log.info("Calculating proportions with {0} samples in total among projects ...".format(sample_total))

		# Proportions stay unset when there are no samples (avoids dividing by zero)
		if sample_total > 0:
			for table in ("variant_genes", "genes", "pathways"):
				c.execute(
					"UPDATE {0} SET sample_prop=CAST(sample_freq AS REAL)/?".format(table),
					(float(sample_total),))

		c.close()
		conn.commit()

		log.info("Saving results ...")

		c = conn.cursor()

		base_path = get_combination_path(conf, "recurrences")

		# Parameter lines shared by the three output files
		header_params = [
			("classifier", classifier["id"]),
			("group_id", group_name),
			("group_short_name", group_short_name),
			("group_long_name", group_long_name),
			("projects", ",".join(project_ids)),
			("SAMPLE_TOTAL", sample_total)]

		log.info("  Variant genes ...")

		with tsv.open(os.path.join(base_path, "variant_gene-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
			_recurrences_write_header(f, header_params, (
				"CHR", "STRAND", "START", "ALLELE", "GENE_ID", "IMPACT", "IMPACT_CLASS",
				"SAMPLE_FREQ", "SAMPLE_PROP", "PROT_CHANGES", "XREFS"))
			# "chr*1" sorts numerically named chromosomes by value before the
			# plain text ordering is applied.
			for r in c.execute("SELECT * FROM variant_genes JOIN variants USING (var_id) ORDER BY chr*1, chr, strand, start, gene_id"):
				allele = "{0}/{1}".format(r["ref"], r["alt"])
				tsv.write_line(f, r["chr"], r["strand"], r["start"], allele,
					r["gene_id"], r["impact"], TransFIC.class_name(r["impact"]),
					r["sample_freq"], r["sample_prop"], r["prot_changes"], r["xrefs"], null_value="-")

		log.info("  Genes ...")

		with tsv.open(os.path.join(base_path, "gene-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
			_recurrences_write_header(f, header_params, ("GENE_ID", "SAMPLE_FREQ", "SAMPLE_PROP"))
			for r in c.execute("SELECT * FROM genes ORDER BY gene_id"):
				tsv.write_line(f, r["gene_id"], r["sample_freq"], r["sample_prop"], null_value="-")

		log.info("  Pathways ...")

		with tsv.open(os.path.join(base_path, "pathway-{0}.tsv.gz".format(group_file_prefix)), "w") as f:
			_recurrences_write_header(f, header_params, ("PATHWAY_ID", "SAMPLE_FREQ", "SAMPLE_PROP"))
			for r in c.execute("SELECT * FROM pathways ORDER BY pathway_id"):
				tsv.write_line(f, r["pathway_id"], r["sample_freq"], r["sample_prop"], null_value="-")
	finally:
		# Always release the connection and drop the temporary database
		conn.close()
		remove_temp(task, db_path)
Пример #14
0
def prepare_files(project):
	"""Prepare the OncodriveFM input files of a project and schedule its runs.

	The functional impact scores returned by retrieve_data() are saved twice:
	as the OncodriveFM data matrix (oncodrivefm-data.tdm) and as a detailed
	per sample and gene table (sample_gene-fimpact.tsv.gz, whose path is stored
	in project["sample_gene_fi_data"]). A table with the reasons why a gene is
	excluded from the analysis is also written. Finally one copy of the project
	per feature ("genes", "pathways") and slice ("SIFT", "PPH2", "MA") is sent
	through the "projects_out" port, carrying the run settings under the
	"oncodrivefm" key.

	Nothing is sent when the project has no samples data.
	"""
	log = task.logger
	conf = task.conf

	projects_out_port = task.ports("projects_out")

	project_id = project["id"]
	log.info("--- [{0}] --------------------------------------------".format(project_id))

	# NOTE(review): the instance is never used below; the call is kept in case
	# constructing ProjectResults has side effects -- confirm and remove.
	project_results = ProjectResults(project)

	projdb = ProjectDb(project["db"])
	try:
		log.info("Retrieving functional impact scores for genes ...")

		data = retrieve_data(projdb)
	finally:
		projdb.close()

	# save data matrix

	dst_path = os.path.join(project["temp_path"], "oncodrivefm-data.tdm")
	sgfi_path = os.path.join(project["temp_path"], "sample_gene-fimpact.tsv.gz")
	project["sample_gene_fi_data"] = sgfi_path

	log.info("Saving functional impact scores ...")
	log.debug("> {0}".format(dst_path))

	# Both files are managed by the same "with" so that neither is left open
	# if writing fails.
	with open(dst_path, "w") as f, tsv.open(sgfi_path, "w") as sgff:
		tsv.write_line(f, "SAMPLE", "GENE", "SIFT", "PPH2", "MA")
		tsv.write_line(sgff, "SAMPLE", "GENE",
			"SIFT_SCORE", "SIFT_TFIC", "SIFT_TFIC_CLASS",
			"PPH2_SCORE", "PPH2_TFIC", "PPH2_TFIC_CLASS",
			"MA_SCORE", "MA_TFIC", "MA_TFIC_CLASS")

		for (sample, gene), values in data.items():
			(sift_score, sift_tfic, sift_tfic_class,
				pph2_score, pph2_tfic, pph2_tfic_class,
				ma_score, ma_tfic, ma_tfic_class) = values

			# The matrix only carries the raw scores, the detailed table also
			# the TFIC values and classes.
			tsv.write_line(f, sample, gene, sift_score, pph2_score, ma_score)
			tsv.write_line(sgff, sample, gene,
				sift_score, sift_tfic, sift_tfic_class,
				pph2_score, pph2_tfic, pph2_tfic_class,
				ma_score, ma_tfic, ma_tfic_class, null_value="-")

	# count samples

	samples = set()
	gene_sample_count = {}
	for sample, gene in data:
		samples.add(sample)
		gene_sample_count[gene] = gene_sample_count.get(gene, 0) + 1

	num_samples = len(samples)

	if num_samples == 0:
		log.warn("There are no samples data, skipping OncodriveFM for this project")
		return

	(num_cores, estimator,
		genes_num_samplings, genes_threshold, genes_filter_enabled, genes_filter, filt,
		pathways_num_samplings, pathways_threshold) = get_oncodrivefm_configuration(log, conf, project, num_samples)

	# Create a dataset with information on why some genes are not considered for calculation in OncodriveFM
	# There are basically two possible reasons:
	# - It does not pass the filter
	# - There are less samples mutated than the threshold

	exc_path = os.path.join(project["temp_path"], "oncodrivefm-excluded-cause.tsv")

	log.info("Saving excluded gene causes ...")
	log.debug("> {0}".format(exc_path))

	with tsv.open(exc_path, "w") as exf:
		tsv.write_line(exf, "GENE", "EXCLUDED_CAUSE")
		for gene, sample_count in gene_sample_count.items():
			causes = []
			if genes_filter_enabled and not filt.valid(gene):
				causes.append(ProjectDb.GENE_EXC_FILTER)
			if sample_count < genes_threshold:
				causes.append(ProjectDb.GENE_EXC_THRESHOLD)
			if causes:
				# The cause codes are concatenated without any separator
				tsv.write_line(exf, gene, "".join(causes))

	ofm = dict(
		data=dst_path,
		num_cores=num_cores,
		estimator=estimator)

	# One run per feature and slice: all the genes runs are sent first, then
	# the pathways runs. The gene filter settings are sent for both features.
	feature_settings = (
		("genes", genes_num_samplings, genes_threshold),
		("pathways", pathways_num_samplings, pathways_threshold))

	for feature, num_samplings, threshold in feature_settings:
		for slice_name in ("SIFT", "PPH2", "MA"):
			projects_out_port.send(dict(project,
				oncodrivefm=dict(ofm,
					feature=feature,
					slice=slice_name,
					num_samplings=num_samplings,
					threshold=threshold,
					filter_enabled=genes_filter_enabled,
					filter=genes_filter)))
Пример #15
0
def datasets(project):
	"""Export the results of a project as datasets for the website.

	The following datasets are written under the website results path of the
	project: variant_gene, variant-samples, consequence, gene, pathway,
	gene_sample-fimpact (unless the "skip_oncodrivefm" option is enabled) and
	project.tsv. Missing values are written as a backslash followed by "N".
	The project is finally sent through the "projects_out" port.
	"""
	log = task.logger
	conf = task.conf

	project_id = project["id"]

	log.info("--- [{0}] --------------------------------------------".format(project_id))

	project_path = project["path"]

	datasets_path = get_website_results_path(project_path)
	if not os.path.exists(datasets_path):
		os.makedirs(datasets_path)

	# Backslash + "N" marks missing values (presumably the NULL marker expected
	# by the database bulk loader -- confirm). The backslash is escaped because
	# a bare "\N" is an invalid escape sequence under Python 3.
	null = "\\N"

	def open_ds(name):
		# All the datasets of this task are opened for writing
		return open_dataset(project_id, project_path, datasets_path, name, "w", log)

	sigdb = SigDb(conf["sigdb_path"])
	sigdb.open()
	try:
		projdb = ProjectDb(project["db"])
		try:
			gene_sym = projdb.get_gene_symbols()

			total_samples = projdb.get_total_affected_samples()

			log.info("Exporting variant genes ...")

			with open_ds("variant_gene") as vf:
				tsv.write_param(vf, "SAMPLE_TOTAL", total_samples)
				tsv.write_line(vf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE",
					"GENE_ID", "IMPACT", "IMPACT_CLASS",
					"SAMPLE_FREQ", "SAMPLE_PROP",
					"CODING_REGION", "PROTEIN_CHANGES", "INTOGEN_DRIVER", "XREFS")

				with open_ds("variant-samples") as sf:
					tsv.write_line(sf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "SAMPLE")

					count = 0
					for afg in projdb.affected_genes(join_variant=True, join_samples=True, join_xrefs=True, join_rec=True):
						var = afg.var
						rec = afg.rec

						start, _end, ref, alt = var_to_tab(var)

						allele = "{0}/{1}".format(ref, alt)

						# Variants found in the signatures database get an
						# extra "I:1" cross reference.
						xrefs = list(var.xrefs)
						if sigdb.exists_variant(var.chr, start):
							xrefs.append("I:1")
						xrefs = ",".join(xrefs)

						intogen_driver = 1 if sigdb.exists_gene(afg.gene_id) else 0

						tsv.write_line(vf, var.id, var.chr, var.strand, start, allele,
							afg.gene_id, afg.impact, TransFIC.class_name(afg.impact),
							rec.sample_freq, rec.sample_prop,
							afg.coding_region, afg.prot_changes, intogen_driver, xrefs, null_value=null)

						for sample in var.samples:
							tsv.write_line(sf, var.id, var.chr, var.strand, start, allele, sample.name, null_value=null)

						count += 1

			log.info("  {0} variant genes".format(count))

			log.info("Exporting consequences ...")

			with open_ds("consequence") as cf:
				tsv.write_line(cf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "TRANSCRIPT_ID",
					"CT", "GENE_ID", "SYMBOL", "UNIPROT_ID", "PROTEIN_ID", "PROTEIN_POS", "AA_CHANGE",
					"SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
					"PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
					"MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS",
					"IMPACT", "IMPACT_CLASS")

				count = 0
				for csq in projdb.consequences(join_variant=True):
					var = csq.var
					start, _end, ref, alt = var_to_tab(var)

					allele = "{0}/{1}".format(ref, alt)

					# Protein and score columns stay empty unless the
					# consequence types match the corresponding term sets.
					uniprot = protein = protein_pos = aa_change = None
					sift_score = sift_tfic = sift_tfic_class = None
					pph2_score = pph2_tfic = pph2_tfic_class = None
					ma_score = ma_tfic = ma_tfic_class = None

					if so.match(csq.ctypes, so.ONCODRIVEFM):
						uniprot, protein = csq.uniprot, csq.protein

					if so.match(csq.ctypes, so.NON_SYNONYMOUS):
						protein_pos, aa_change = csq.protein_pos, csq.aa_change
						sift_score, sift_tfic, sift_tfic_class = csq.sift_score, csq.sift_tfic, TransFIC.class_name(csq.sift_tfic_class)
						pph2_score, pph2_tfic, pph2_tfic_class = csq.pph2_score, csq.pph2_tfic, TransFIC.class_name(csq.pph2_tfic_class)
						ma_score, ma_tfic, ma_tfic_class = csq.ma_score, csq.ma_tfic, TransFIC.class_name(csq.ma_tfic_class)

					tsv.write_line(cf, var.id, var.chr, var.strand, start, allele, csq.transcript,
						",".join(csq.ctypes), csq.gene, gene_sym.get(csq.gene),
						uniprot, protein, protein_pos, aa_change,
						sift_score, sift_tfic, sift_tfic_class,
						pph2_score, pph2_tfic, pph2_tfic_class,
						ma_score, ma_tfic, ma_tfic_class,
						csq.impact, TransFIC.class_name(csq.impact), null_value=null)
					count += 1

			log.info("  {0} consequences".format(count))

			log.info("Exporting genes ...")

			with open_ds("gene") as gf:
				tsv.write_param(gf, "SAMPLE_TOTAL", total_samples)
				tsv.write_line(gf, "GENE_ID", "FM_PVALUE", "FM_QVALUE", "FM_EXC_CAUSE",
					"CLUST_ZSCORE", "CLUST_PVALUE", "CLUST_QVALUE", "CLUST_EXC_CAUSE", "CLUST_COORDS",
					"SAMPLE_FREQ", "SAMPLE_PROP", "INTOGEN_DRIVER")

				for gene in projdb.genes(join_rec=True):
					rec = gene.rec

					# Genes not mutated in any sample are not exported
					if rec.sample_freq is None or rec.sample_freq == 0:
						continue

					intogen_driver = 1 if sigdb.exists_gene(gene.id) else 0

					tsv.write_line(gf, gene.id, gene.fm_pvalue, gene.fm_qvalue, gene.fm_exc_cause,
						gene.clust_zscore, gene.clust_pvalue, gene.clust_qvalue, gene.clust_exc_cause, gene.clust_coords,
						rec.sample_freq or 0, rec.sample_prop or 0,
						intogen_driver, null_value=null)

			log.info("Exporting pathways ...")

			with open_ds("pathway") as pf:
				tsv.write_param(pf, "SAMPLE_TOTAL", total_samples)
				tsv.write_line(pf, "PATHWAY_ID", "GENE_COUNT", "FM_ZSCORE", "FM_PVALUE", "FM_QVALUE",
					"SAMPLE_FREQ", "SAMPLE_PROP", "GENE_FREQ", "GENE_TOTAL", "GENE_PROP")

				for pathway in projdb.pathways(join_rec=True):
					rec = pathway.rec

					# Pathways without mutated samples are not exported
					if rec.sample_freq is None or rec.sample_freq == 0:
						continue

					tsv.write_line(pf, pathway.id, pathway.gene_count, pathway.fm_zscore, pathway.fm_pvalue, pathway.fm_qvalue,
						rec.sample_freq or 0, rec.sample_prop or 0, rec.gene_freq or 0, pathway.gene_count, rec.gene_prop or 0, null_value=null)

			skip_oncodrivefm = conf.get("skip_oncodrivefm", False, dtype=bool)

			if not skip_oncodrivefm:

				log.info("Exporting genes per sample functional impact ...")

				with open_ds("gene_sample-fimpact") as f:
					tsv.write_line(f, "GENE_ID", "SAMPLE",
						"SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
						"PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
						"MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS")

					for fields in projdb.sample_gene_fimpacts():
						(gene, sample,
							sift_score, sift_tfic, sift_tfic_class,
							pph2_score, pph2_tfic, pph2_tfic_class,
							ma_score, ma_tfic, ma_tfic_class) = fields
						tsv.write_line(f, gene, sample,
							sift_score, sift_tfic, TransFIC.class_name(sift_tfic_class),
							pph2_score, pph2_tfic, TransFIC.class_name(pph2_tfic_class),
							ma_score, ma_tfic, TransFIC.class_name(ma_tfic_class), null_value=null)
		finally:
			projdb.close()
	finally:
		sigdb.close()

	log.info("Saving project configuration ...")

	projres = ProjectResults(project)

	with open_ds("project.tsv") as f:
		names = ["ASSEMBLY", "SAMPLES_TOTAL"]
		values = [project["assembly"], total_samples]
		names, values = projres.get_annotations_to_save(conf, project["annotations"], names=names, values=values)
		tsv.write_line(f, *names)
		tsv.write_line(f, *values, null_value=null)

	projects_port = task.ports("projects_out")
	projects_port.send(project)
Пример #16
0
def liftover(project):
	"""Convert the variant positions of a project from hg18 to hg19.

	Every variant of the project database is written to a BED file which is
	converted by the external liftOver program using the hg18ToHg19 chain file.
	Variants reported as unmapped get their start set to None, the remaining
	ones get the converted start. The project is then marked as hg19 (the
	previous assembly is kept under "from_assembly") and sent through the
	"lifted_projects" port.

	Raises:
		Exception: if liftOver exits with a non zero status. The database is
			left untouched in that case, instead of relabelling unconverted
			coordinates as hg19.
	"""
	import shlex  # local import, only needed to split the configured command

	log = task.logger
	conf = task.conf

	lifted_project_port = task.ports("lifted_projects")

	log.info("--- [{0}] --------------------------------------------".format(project["id"]))

	log.info("Preparing liftOver files ...")

	in_path = make_temp_file(task, suffix=".bed")
	out_path = make_temp_file(task, suffix=".bed")
	unmapped_path = os.path.join(project["temp_path"], "liftover_unmapped.bed")

	projdb = ProjectDb(project["db"])
	try:
		# The variant id travels in the fourth BED column so that the results
		# can be matched back to the database.
		with open(in_path, "w") as in_file:
			for var in projdb.variants(order_by="position"):
				in_file.write(tsv.line_text("chr" + var.chr, var.start, var.start + len(var.ref), var.id))

		log.info("Running liftOver ...")

		# The command is run without a shell and with one argument per path, so
		# paths containing spaces or shell metacharacters are passed verbatim.
		# The configured binary is split first in case it carries options.
		cmd = shlex.split(conf["liftover_bin"]) + [
			in_path,
			os.path.join(conf["liftover_chain_path"], "hg18ToHg19.over.chain"),
			out_path,
			unmapped_path]

		log.debug(" ".join(cmd))

		retcode = subprocess.call(cmd)
		if retcode != 0:
			raise Exception("liftOver failed with exit code {0}".format(retcode))

		project["from_assembly"] = project["assembly"]
		project["assembly"] = "hg19"

		log.info("Annotating unmapped variants ...")

		count = 0
		with open(unmapped_path, "r") as f:
			for line in f:
				# Lines starting with "#" carry no variant record
				if line.lstrip().startswith("#"):
					continue
				fields = line.rstrip().split("\t")
				var_id = int(fields[3])
				projdb.update_variant_start(var_id, start=None)
				count += 1

		log.info("  {0} unmapped variants annotated".format(count))

		log.info("Updating variants ...")

		count = 0
		with open(out_path, "r") as f:
			for line in f:
				_chrom, start, _end, var_id = line.rstrip().split("\t")
				# Ids and positions are passed as integers, like the ids of
				# the unmapped variants above.
				projdb.update_variant_start(int(var_id), start=int(start))
				count += 1

		log.info("  {0} variants".format(count))

		projdb.commit()
	finally:
		# Release the database handle even when liftOver or the update fails
		projdb.close()

	remove_temp(task, in_path, out_path)

	lifted_project_port.send(project)