Пример #1
0
Файл: wait.py Проект: bbglab/wok
def process(time):
	"""Log the received value, sleep for `time` seconds and forward it.

	time -- number of seconds to wait (passed straight to `sleep`).
	Sends the same value on the "time2" port when done.
	"""
	# FIX: removed a block of commented-out dead code (random delay
	# computation and an early-return experiment) that was never executed.
	task.logger.info("{}: {}".format(type(time), repr(time)))
	# NOTE(review): "{:.2}" formats with 2 significant digits, not 2
	# decimal places; use "{:.2f}" if fixed decimals were intended -- TODO confirm.
	task.logger.info("Waiting for {:.2} seconds ...".format(time))
	sleep(time)

	task.ports("time2").send(time)
Пример #2
0
def process(time):
    """Wait for the given number of seconds, then pass the value on."""
    logger = task.logger

    # Log both the type and the value we were handed.
    logger.info("{}: {}".format(type(time), repr(time)))
    logger.info("Waiting for {:.2} seconds ...".format(time))
    sleep(time)

    # Forward the same value to the next module.
    task.ports("time2").send(time)
Пример #3
0
def projects(project):
	"""Compute per-project recurrences and forward the project downstream."""
	log = task.logger
	conf = task.conf

	out_port = task.ports("projects_out")

	log.info("--- [{0}] --------------------------------------------".format(project["id"]))

	db = ProjectDb(project["db"])

	num_samples = db.get_total_affected_samples()

	# Nothing to compute without affected samples.
	if num_samples == 0:
		log.warn("There are no samples, recurrences cannot be calculated.")
		db.close()
		return

	log.info("Calculating project recurrences for variant genes ...")

	db.compute_affected_genes_recurrences(num_samples)

	variants_only = conf.get("variants_only", False)
	if not variants_only:

		log.info("Calculating project recurrences for genes ...")

		db.compute_gene_recurrences(num_samples)

		log.info("Calculating project recurrences for pathways ...")

		db.compute_pathway_recurrences(num_samples)

	db.commit()
	db.close()

	out_port.send(project)
Пример #4
0
def compute(project):
	"""Run oncodrivefm-combine for one project/feature and forward the project."""
	log = task.logger
	conf = task.conf

	out_port = task.ports("projects_out")

	project_id = project["id"]
	ofm = project["oncodrivefm"]
	feature = ofm["feature"]

	log.info("--- [{0} @ {1}] --------------------------------------------".format(project_id, feature))

	temp_path = project["temp_path"]

	# Command line: options first, then the input data paths.
	args = [
		"oncodrivefm-combine",
		"-o", temp_path,
		"-n oncodrivefm-{0}".format(feature)]
	args.extend(ofm["data"])

	# Record where the combined results will be written.
	ofm["results"] = os.path.join(temp_path, "oncodrivefm-{0}.tsv".format(feature))

	cmd = " ".join(args)

	log.debug(cmd)

	# NOTE(review): shell=True with a joined string; paths are pipeline-internal.
	ret_code = subprocess.call(cmd, shell=True)
	if ret_code != 0:
		raise Exception("OncodriveFM error while combining {0}:\n{1}".format(feature, cmd))

	out_port.send(project)
Пример #5
0
def ma_run(partition):
	"""Query Mutation Assessor for the missense variants of one partition.

	partition -- dict carrying the project, the partition index and paths.
	Writes a "<index>.ma" results file (skipped when it already exists and
	overwriting is disabled) and forwards the partition with "ma_path" set.
	"""
	log = task.logger
	conf = task.conf

	results_port = task.ports("results")

	project = partition["project"]

	log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))

	offline = conf["offline"]

	# Select the backend: local cache vs. remote web service.
	if offline == "yes":
		log.info("Running Mutation assessor in local mode.")

		ma = MaLocal(conf["ma_cache_path"])
	else:
		log.info("Running Mutation assessor using web services.")

		ma = MaService(project["assembly"], cache_path=os.path.join(conf["cache_path"], "ma.db"))

	results_path = os.path.join(partition["base_path"], "{0:08d}.ma".format(partition["index"]))

	if not os.path.exists(results_path) or conf.get("consequences_overwrite", True):

		log.info("Querying Mutation assessor for 'missense_variant' consequences ...")

		projdb = ProjectDb(project["db"])

		# Collect ids of variants with a non-synonymous consequence from
		# the VEP output produced earlier in the pipeline.
		missense_variants = set()

		with open(partition["vep_path"], "r") as f:
			for line in f:
				fields = line.rstrip().split("\t")
				var_id = int(fields[0])
				ctypes = fields[3].split(",")
				if so.match(ctypes, so.NON_SYNONYMOUS):
					missense_variants.add(var_id)

		# Query MA per variant and save uniprot id + functional impact score.
		with open(results_path, "w") as mf:
			for var_id in missense_variants:
				var = projdb.get_variant(var_id)

				start, end, ref, alt = var_to_tab(var)

				r = ma.get(var.chr, var.strand, start, ref, alt, var_id)
				if r is not None:
					tsv.write_line(mf, var_id, r.uniprot, r.fi_score, null_value="-")

		projdb.close()

	else:
		log.warn("Skipping MA, results already exist.")
		log.debug("MA results: {0}".format(results_path))

	ma.close()

	# Send results to the next module
	partition["ma_path"] = results_path
	results_port.send(partition)
Пример #6
0
def end():
	"""Log the classification results and emit one project set per group.

	Reads the classifiers and their grouped results from the task context.
	Each emitted item pairs the classifier description (augmented with the
	group's names/values) with the list of projects in that group.
	"""
	log = task.logger
	conf = task.conf

	project_sets_port = task.ports("project_sets")

	classifiers = task.context["classifiers"]
	results = task.context["results"]

	log.info("Classification results ...")

	# results[index] holds the groups produced by classifiers[index].
	for index, classifier in enumerate(classifiers):
		log.info("  {0}:".format(classifier["name"]))
		cresults = results[index]
		groups = sorted(cresults.keys())
		for group_values in groups:
			group_short_values, group_long_values, projects = cresults[group_values]

			# Human-readable group identifiers built from the value tuples.
			group_name = str_join("-", group_values)
			group_short_name = str_join("-", group_short_values)
			group_long_name = str_join("; ", group_long_values)

			projects_set = (dict(classifier,
								 group_name=group_name,
								 group_values=group_values,
								 group_short_name=group_short_name,
								 group_short_values=group_short_values,
								 group_long_name=group_long_name,
								 group_long_values=group_long_values),
							projects)

			log.info("    ({0}) -> {1} projects".format(group_name, len(projects)))
			project_sets_port.send(projects_set)
Пример #7
0
def create(project):
	"""Create the project website from the configured templates.

	Copies the template tree, expands placeholder variables in selected
	files and links the results folder, then forwards the project on the
	"projects_out" port. Skips (with a warning) when no website templates
	are configured.
	"""
	log = task.logger

	config = GlobalConfig(task.conf)
	paths = PathsConfig(config)

	project_id = project["id"]

	log.info("--- [{0}] --------------------------------------------".format(project_id))

	project_path = project["path"]
	temp_path = project["temp_path"]

	templates_path = config.website.templates_path
	if templates_path is None:
		log.warn("No website templates have been defined in the configuration. Skipping website creation.")
		return

	log.info("Creating website ...")

	# Rebuild the website folder from scratch.
	website_path = paths.project_website_path(project_path)
	if os.path.exists(website_path):
		shutil.rmtree(website_path)

	log.info("Copying templates ...")

	shutil.copytree(templates_path, website_path)

	# Best-effort removal of a .gitignore copied from the template checkout.
	# FIX: catch only OSError instead of a bare except, which also swallowed
	# KeyboardInterrupt/SystemExit.
	gitignore_path = os.path.join(website_path, ".gitignore")
	try:
		os.remove(gitignore_path)
	except OSError:
		pass

	log.info("Expanding templates ...")

	# FIX: renamed from `vars`, which shadowed the builtin.
	template_vars = dict(
		PROJECT_NAME=project_id,
		SHOW_ALL_TABS=not config.variants_only)

	tmpl_paths = [
		os.path.join(website_path, "css", "header.html"),
		os.path.join(website_path, "onexus-project.onx")
	]

	for path in tmpl_paths:
		with open(path, "r") as f:
			t = Template(f.read())
		with open(path, "w") as f:
			f.write(t.safe_substitute(template_vars))

	# Creating a soft link to the results folder
	project_results_path = paths.project_results_path(project_path)
	os.symlink(project_results_path, os.path.join(website_path, "results"))

	# Send project to the next modules

	projects_port = task.ports("projects_out")
	project["website"] = website_path
	projects_port.send(project)
Пример #8
0
def square(x):
    """Square `x`, log the result and send it on the "x_square" port."""

    # FIX: the local result was named `square`, shadowing the function itself.
    result = x * x
    task.logger.info("x = {0}, x^2 = {1}".format(x, result))

    x_square_port = task.ports("x_square")
    x_square_port.send(result)
Пример #9
0
def update_db(project):
	"""Accumulate the project under its id in the shared task context.

	FIX: `task.context[project_id] += [project]` was executed
	unconditionally, which raises KeyError for a project id that has not
	been seen yet; the entry is now initialized on first use (matching the
	guarded variant of this function elsewhere in this file).
	"""
	log = task.logger
	conf = task.conf

	projects_out_port = task.ports("projects_out")

	project_id = project["id"]
	ofm = project["oncodrivefm"]

	if project_id in task.context:
		task.context[project_id] += [project]
	else:
		task.context[project_id] = [project]
Пример #10
0
def main():
    """Sum every value received on the "x" port and send it on "sum"."""
    values, result = task.ports("x", "sum")

    count, nsum = 0, 0
    for v in values:
        count, nsum = count + 1, nsum + v

    task.logger.info("Sum of {0} numbers = {1}".format(count, nsum))

    result.send(nsum)
Пример #11
0
Файл: sum.py Проект: bbglab/wok
def main():
	"""Accumulate the "x" stream and report its sum on the "sum" port."""
	values, result = task.ports("x", "sum")

	# count keeps its initial value of 0 when the stream is empty.
	count = 0
	nsum = 0
	for count, v in enumerate(values, start=1):
		nsum += v

	task.logger.info("Sum of {0} numbers = {1}".format(count, nsum))

	result.send(nsum)
Пример #12
0
def route(project):
	"""Send a project to the main output and to each enabled analysis port."""
	log = task.logger
	conf = task.conf

	# All ports are created up front, whether or not they end up being used.
	projects_port = task.ports("projects_out")
	analysis_ports = {
		"recurrences": task.ports("recurrences_projects"),
		"oncodrivefm": task.ports("oncodrivefm_projects"),
		"oncodriveclust": task.ports("oncodriveclust_projects")}

	log.info("  {0}".format(project["id"]))

	projects_port.send(project)

	# Each analysis can be disabled with a "skip_<name>" flag in the context.
	for name in ("recurrences", "oncodrivefm", "oncodriveclust"):
		if not task.context["skip_" + name]:
			analysis_ports[name].send(project)
Пример #13
0
def end():
	"""Emit every combined project in deterministic (key-sorted) order."""
	log = task.logger

	out_port = task.ports("projects_out")

	log.info("Sending projects ...")

	combinations = task.context["combinations"]

	# Iterate the keys in sorted order so the output order is reproducible.
	for key in sorted(combinations):
		log.debug(key)
		out_port.send(combinations[key])
Пример #14
0
def route(project):
	"""Forward a project downstream and to each analysis that is enabled."""
	log = task.logger

	config = GlobalConfig(task.conf)

	# All ports are created up front, whether or not they end up being used.
	projects_port = task.ports("projects_out")
	analysis_ports = {
		"recurrences": task.ports("recurrences_projects"),
		"oncodrivefm": task.ports("oncodrivefm_projects"),
		"oncodriveclust": task.ports("oncodriveclust_projects")}

	log.info("  {0}".format(project["id"]))

	projects_port.send(project)

	# config.skip_<name> turns the corresponding analysis off.
	for name in ("recurrences", "oncodrivefm", "oncodriveclust"):
		if not getattr(config, "skip_" + name):
			analysis_ports[name].send(project)
Пример #15
0
def update_db(project):
    """Collect the project into the per-project-id list in the task context."""
    log = task.logger
    conf = task.conf

    projects_out_port = task.ports("projects_out")

    project_id = project["id"]
    ofm = project["oncodrivefm"]

    # First occurrence of this id starts a fresh list; later ones extend it.
    if project_id not in task.context:
        task.context[project_id] = [project]
    else:
        task.context[project_id] += [project]
Пример #16
0
def vep_run(partition):
	"""Run the Variant Effect Predictor on one partition of variants.

	Uses a local VEP installation when conf["offline"] == "yes", otherwise
	the web service. Results go to "<index>.vep" (skipped when the file
	exists and overwriting is disabled); the partition is forwarded with
	"vep_path" set.
	"""
	log = task.logger
	conf = task.conf

	results_port = task.ports("results")

	project_id = partition["project"]["id"]
	log.info("--- [{0} @ {1}] --------------------------------------------".format(project_id, partition["index"]))

	offline = conf["offline"]

	# Select the backend: local Perl script vs. remote web service.
	if offline == "yes":
		log.info("Running VEP in local mode.")

		vep = VepLocal(
				perl_path=conf["perl_bin"],
				lib_path=conf["perl_lib"],
				script_path=os.path.join(conf["ext_bin_path"], "variant_effect_predictor", "variant_effect_predictor.pl"),
				cache_path=os.path.join(conf["data_path"], "vep_cache"))
	else:
		log.info("Running VEP using web services.")

		vep = VepService(cache_path=os.path.join(conf["cache_path"], "vep.db"))

	results_path = os.path.join(partition["base_path"], "{0:08d}.vep".format(partition["index"]))

	if not os.path.exists(results_path) or conf.get("consequences_overwrite", True):
		# Run VEP
		vep.run(partition["bed_path"])

		log.info("Saving results ...")
		log.debug("VEP results: {0}".format(vep.results_path))

		# Save results

		with open(results_path, "w") as f:
			for r in vep.results():
				tsv.write_line(f, r.var_id, r.gene, r.transcript,
								",".join(r.consequences),
								r.protein_pos, r.aa_change, r.protein,
								r.sift, r.polyphen, null_value="-")

	else:
		log.warn("Skipping VEP, results already exist.")
		log.debug("VEP results: {0}".format(results_path))

	vep.close()

	# Send results to the next module
	partition["vep_path"] = results_path
	results_port.send(partition)
Пример #17
0
def end():
    """Regroup partitions per project and emit one message per project."""
    log = task.logger

    projects_port = task.ports("projects")

    for project_id, partitions in task.context.items():
        log.info("Project {0}: {1} partitions".format(project_id, len(partitions)))

        # All partitions of a project carry the same project dict.
        project = partitions[0]["project"]
        collected = []
        for part in partitions:
            # Drop the back-reference before embedding the partition.
            del part["project"]
            collected.append(part)
        project["partitions"] = collected

        projects_port.send(project)
Пример #18
0
Файл: sum.py Проект: bbglab/wok
def main():
	"""Count and sum the values from the "x" port, sending both results.

	FIX: the running total was held in a local named `sum`, which shadowed
	the builtin; renamed to `total`.
	"""

	values, count_port, sum_port = task.ports("x", "count", "sum")

	count = 0
	total = 0
	for v in values:
		task.logger.info("value = {0}".format(v))
		count += 1
		total += v

	task.logger.info("Sum of {0} numbers = {1}".format(count, total))

	count_port.send(count)
	sum_port.send(total)
Пример #19
0
def main():
    """Log each incoming value on "x", then send the count and the sum."""

    values, count_port, sum_port = task.ports("x", "count", "sum")

    # count keeps its initial value of 0 when the stream is empty.
    count = 0
    sum = 0
    for count, v in enumerate(values, 1):
        task.logger.info("value = {0}".format(v))
        sum += v

    task.logger.info("Sum of {0} numbers = {1}".format(count, sum))

    count_port.send(count)
    sum_port.send(sum)
Пример #20
0
def split_variants(project):
    """Split a project's variants into fixed-size partitions for VEP.

    Writes one "<index>.vep_in" file per partition under the project's
    consequences folder and sends a partition descriptor on the
    "partitions" port as soon as each file is started.
    """
    log = task.logger

    config = GlobalConfig(task.conf)

    partition_port = task.ports("partitions")

    log.info("--- [{}] --------------------------------------------".format(project["id"]))

    projdb = ProjectDb(project["db"])

    log.info("Preparing variants for VEP ...")

    base_path = os.path.join(project["temp_path"], "consequences")

    ensure_path_exists(base_path)

    project["csq_path"] = base_path

    partition_size = config.vep_partition_size
    partition = -1
    f = None

    count = 0
    for var in projdb.variants(order_by="position"):
        start, end, ref, alt = var_to_tab(var)

        # Roll over to a new partition file every `partition_size` variants.
        if count % partition_size == 0:
            if f is not None:
                f.close()

            partition += 1
            partition_path = os.path.join(base_path, "{0:08d}.vep_in".format(partition))
            f = open(partition_path, "w")
            # The descriptor is sent before the file is full so downstream
            # work can start; bed_path points at the file being written.
            partition_port.send(
                {"project": project, "index": partition, "bed_path": partition_path, "base_path": base_path}
            )

        tsv.write_line(f, var.chr, start, end, ref + "/" + alt, var.strand, var.id)

        count += 1

    if f is not None:
        f.close()

    log.info("{} variants split into {} partitions".format(count, partition + 1))

    projdb.close()
Пример #21
0
def create(project):
	"""Create the project website from the configured templates.

	Copies the template tree into the project's website folder, expands
	the placeholder variables in selected template files and forwards the
	project on the "projects_out" port. Does nothing (beyond a warning)
	when no templates are configured.
	"""
	log = task.logger
	conf = task.conf

	project_id = project["id"]

	log.info("--- [{0}] --------------------------------------------".format(project_id))

	project_path = project["path"]
	temp_path = project["temp_path"]

	templates_path = conf.get("website.templates_path")
	if templates_path is None:
		log.warn("No website templates have been defined in the configuration. Skipping website creation.")
		return

	log.info("Creating website ...")

	# Rebuild the website folder from scratch.
	website_path = get_website_path(project_path)
	if os.path.exists(website_path):
		shutil.rmtree(website_path)

	log.info("Copying templates ...")

	shutil.copytree(templates_path, website_path)

	log.info("Expanding templates ...")

	# FIX: renamed from `vars`, which shadowed the builtin.
	template_vars = dict(
		PROJECT_NAME=project_id,
		SHOW_ALL_TABS=not conf.get("variants_only", False))

	tmpl_paths = [
		os.path.join(website_path, "css", "header.html"),
		os.path.join(website_path, "onexus-project.onx")
	]

	for path in tmpl_paths:
		with open(path, "r") as f:
			t = Template(f.read())
		with open(path, "w") as f:
			f.write(t.safe_substitute(template_vars))

	# Send project to the next modules

	projects_port = task.ports("projects_out")
	project["website"] = website_path
	projects_port.send(project)
Пример #22
0
def update_db(project):
	"""Load OncodriveCLUST results into the project database.

	Reads the excluded-gene causes and the results TSV produced by the
	OncodriveCLUST run, updates the gene records accordingly, and forwards
	the project (with its "oncodriveclust" entry removed).
	"""
	log = task.logger
	conf = task.conf

	projects_out_port = task.ports("projects_out")

	project_id = project["id"]
	log.info("--- [{0}] --------------------------------------------".format(project_id))

	oclust = project["oncodriveclust"]
	del project["oncodriveclust"]

	# NOTE(review): returning here skips the send, so downstream modules
	# never see this project when results are missing -- confirm intended.
	if not os.path.exists(oclust["results"]):
		log.warn("No results have been found. Skipping it.")
		return

	log.info("Updating the project database ...")

	projdb = ProjectDb(project["db"])

	exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")

	log.info("  Excluded gene causes ...")
	log.debug("    > {0}".format(exc_path))

	# Mark each excluded gene with the cause recorded by prepare_files.
	count = 0
	with tsv.open(exc_path, "r") as exf:
		for gene, cause in tsv.lines(exf, (str, str), header=True):
			projdb.update_gene(Gene(id=gene, clust_exc_cause=cause))
			count += 1

	log.debug("    {0} genes excluded".format(count))

	log.info("  OncodriveCLUST results ...")

	# Store cluster coordinates and statistics for each reported gene.
	with tsv.open(oclust["results"], "r") as f:
		types = (str, str, float, float, float)
		columns = ("GENE", "CLUST_COORDS", "ZSCORE", "PVALUE", "QVALUE")
		for gene, coords, zscore, pvalue, qvalue in tsv.lines(f, types, columns=columns, header=True, null_value="NA"):
			projdb.update_gene(Gene(id=gene, clust_coords=coords, clust_zscore=zscore, clust_pvalue=pvalue,
									clust_qvalue=qvalue, clust_exc_cause=ProjectDb.NO_GENE_EXC))

	projdb.commit()

	projdb.close()

	projects_out_port.send(project)
Пример #23
0
def oncoclust(project):
	"""Run the oncodriveclust command for a project and forward it."""
	log = task.logger

	config = GlobalConfig(task.conf)
	paths = PathsConfig(config)

	out_port = task.ports("projects_out")

	project_id = project["id"]
	log.info("--- [{0}] --------------------------------------------".format(project_id))

	gene_transcripts_path = paths.data_ensembl_gene_transcripts_path()

	oclust = project["oncodriveclust"]
	data_paths = oclust["data_paths"]
	samples_threshold = oclust["samples_threshold"]

	results_path = os.path.join(project["temp_path"], "oncodriveclust-results.tsv")
	oclust["results"] = results_path

	# Positional arguments: non-syn data, syn data, transcripts file.
	args = [
		"oncodriveclust",
		"-c",
		"-m", str(samples_threshold),
		"-o", results_path,
		data_paths[0],
		data_paths[1],
		gene_transcripts_path]
	cmd = " ".join(args)

	log.debug(cmd)

	# Exit code 1 means "no results"; anything else non-zero is an error.
	ret_code = subprocess.call(cmd, shell=True)
	if ret_code == 1:
		log.warn("No results were generated")
	elif ret_code != 0:
		log.error("Error while executing OncodriveCLUST:\n{0}".format(cmd))

	out_port.send(project)
Пример #24
0
def create_datasets(project):
	"""Export a project's results into the website dataset files.

	Writes the variant_gene, variant-samples, consequence, gene, pathway
	and (unless OncodriveFM was skipped) gene_sample-fimpact datasets plus
	the project.tsv summary, then forwards the project on "projects_out".
	"""
	log = task.logger

	config = GlobalConfig(task.conf)
	paths = PathsConfig(config)

	project_id = project["id"]

	log.info("--- [{0}] --------------------------------------------".format(project_id))

	project_path = project["path"]
	temp_path = project["temp_path"]

	datasets_path = paths.project_results_path(project_path)
	ensure_path_exists(datasets_path)

	# Signature database used to flag known driver genes/variants.
	sigdb = SigDb(config.sigdb_path)
	sigdb.open()

	projdb = ProjectDb(project["db"])

	gene_sym = projdb.get_gene_symbols()

	total_samples = projdb.get_total_affected_samples()

	log.info("Exporting variant genes ...")

	vf = open_dataset(project_id, project_path, datasets_path, "variant_gene", "w", log)
	tsv.write_param(vf, "SAMPLE_TOTAL", total_samples)
	tsv.write_line(vf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE",
					"GENE_ID", "IMPACT", "IMPACT_CLASS",
					"SAMPLE_FREQ", "SAMPLE_PROP",
					"CODING_REGION", "PROTEIN_CHANGES", "INTOGEN_DRIVER", "XREFS")

	sf = open_dataset(project_id, project_path, datasets_path, "variant-samples", "w", log)
	tsv.write_line(sf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "SAMPLE")

	count = 0
	for afg in projdb.affected_genes(join_variant=True, join_samples=True, join_xrefs=True, join_rec=True):
		var = afg.var
		rec = afg.rec

		start, end, ref, alt = var_to_tab(var)

		allele = "{0}/{1}".format(ref, alt)

		# Append the "I:1" marker when the variant is a known signature.
		xrefs = [xref for xref in var.xrefs]
		if sigdb.exists_variant(var.chr, start):
			xrefs += ["I:1"]
		xrefs = ",".join(xrefs)

		intogen_driver = 1 if sigdb.exists_gene(afg.gene_id) else 0

		tsv.write_line(vf, var.id, var.chr, var.strand, start, allele,
						afg.gene_id, afg.impact, TransFIC.class_name(afg.impact),
						rec.sample_freq, rec.sample_prop,
						afg.coding_region, afg.prot_changes, intogen_driver, xrefs, null_value="\N")

		# One row per sample that carries the variant.
		for sample in var.samples:
			tsv.write_line(sf, var.id, var.chr, var.strand, start, allele, sample.name, null_value="\N")

		count += 1

	vf.close()
	sf.close()

	log.info("  {0} variant genes".format(count))

	log.info("Exporting consequences ...")

	cf = open_dataset(project_id, project_path, datasets_path, "consequence", "w", log)
	tsv.write_line(cf, "VAR_ID", "CHR", "STRAND", "START", "ALLELE", "TRANSCRIPT_ID",
				   "CT", "GENE_ID", "SYMBOL", "UNIPROT_ID", "PROTEIN_ID", "PROTEIN_POS", "AA_CHANGE",
					"SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
					"PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
					"MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS",
					"IMPACT", "IMPACT_CLASS")

	count = 0
	for csq in projdb.consequences(join_variant=True):
		var = csq.var
		start, end, ref, alt = var_to_tab(var)

		allele = "{0}/{1}".format(ref, alt)

		# Fields below stay None unless the consequence type warrants them.
		uniprot = protein = protein_pos = aa_change = None
		sift_score = sift_tfic = sift_tfic_class = None
		pph2_score = pph2_tfic = pph2_tfic_class = None
		ma_score = ma_tfic = ma_tfic_class = None

		if so.match(csq.ctypes, so.ONCODRIVEFM):
			uniprot, protein = csq.uniprot, csq.protein

		if so.match(csq.ctypes, so.NON_SYNONYMOUS):
			protein_pos, aa_change = csq.protein_pos, csq.aa_change
			sift_score, sift_tfic, sift_tfic_class = csq.sift_score, csq.sift_tfic, TransFIC.class_name(csq.sift_tfic_class)
			pph2_score, pph2_tfic, pph2_tfic_class = csq.pph2_score, csq.pph2_tfic, TransFIC.class_name(csq.pph2_tfic_class)
			ma_score, ma_tfic, ma_tfic_class = csq.ma_score, csq.ma_tfic, TransFIC.class_name(csq.ma_tfic_class)

		tsv.write_line(cf, var.id, var.chr, var.strand, start, allele, csq.transcript,
						",".join(csq.ctypes), csq.gene, gene_sym.get(csq.gene),
						uniprot, protein, protein_pos, aa_change,
						sift_score, sift_tfic, sift_tfic_class,
						pph2_score, pph2_tfic, pph2_tfic_class,
						ma_score, ma_tfic, ma_tfic_class,
						csq.impact, TransFIC.class_name(csq.impact), null_value="\N")
		count += 1

	cf.close()

	log.info("  {0} consequences".format(count))

	log.info("Exporting genes ...")

	gf = open_dataset(project_id, project_path, datasets_path, "gene", "w", log)
	tsv.write_param(gf, "SAMPLE_TOTAL", total_samples)
	tsv.write_line(gf, "GENE_ID", "FM_PVALUE", "FM_QVALUE", "FM_EXC_CAUSE",
				   "CLUST_ZSCORE", "CLUST_PVALUE", "CLUST_QVALUE", "CLUST_EXC_CAUSE", "CLUST_COORDS",
				   "SAMPLE_FREQ", "SAMPLE_PROP", "INTOGEN_DRIVER")


	for gene in projdb.genes(join_rec=True):
		rec = gene.rec

		# Only export genes actually observed in some sample.
		if rec.sample_freq is None or rec.sample_freq == 0:
			continue

		intogen_driver = 1 if sigdb.exists_gene(gene.id) else 0

		tsv.write_line(gf, gene.id, gene.fm_pvalue, gene.fm_qvalue, gene.fm_exc_cause,
					   gene.clust_zscore, gene.clust_pvalue, gene.clust_qvalue, gene.clust_exc_cause, gene.clust_coords,
					   rec.sample_freq or 0, rec.sample_prop or 0,
					   intogen_driver, null_value="\N")

	gf.close()

	log.info("Exporting pathways ...")

	pf = open_dataset(project_id, project_path, datasets_path, "pathway", "w", log)
	tsv.write_param(pf, "SAMPLE_TOTAL", total_samples)
	tsv.write_line(pf, "PATHWAY_ID", "GENE_COUNT", "FM_ZSCORE", "FM_PVALUE", "FM_QVALUE",
				   "SAMPLE_FREQ", "SAMPLE_PROP", "GENE_FREQ", "GENE_TOTAL", "GENE_PROP")

	for pathway in projdb.pathways(join_rec=True):
		rec = pathway.rec

		if rec.sample_freq is None or rec.sample_freq == 0:
			continue

		tsv.write_line(pf, pathway.id, pathway.gene_count, pathway.fm_zscore, pathway.fm_pvalue, pathway.fm_qvalue,
						rec.sample_freq or 0, rec.sample_prop or 0, rec.gene_freq or 0, pathway.gene_count, rec.gene_prop or 0, null_value="\N")

	pf.close()

	if not config.skip_oncodrivefm:

		log.info("Exporting genes per sample functional impact ...")

		with open_dataset(project_id, project_path, datasets_path, "gene_sample-fimpact", "w", log) as f:
			tsv.write_line(f, "GENE_ID", "SAMPLE",
					   "SIFT_SCORE", "SIFT_TRANSFIC", "SIFT_TRANSFIC_CLASS",
					   "PPH2_SCORE", "PPH2_TRANSFIC", "PPH2_TRANSFIC_CLASS",
					   "MA_SCORE", "MA_TRANSFIC", "MA_TRANSFIC_CLASS")

			for fields in projdb.sample_gene_fimpacts():
				(gene, sample,
					sift_score, sift_tfic, sift_tfic_class,
					pph2_score, pph2_tfic, pph2_tfic_class,
					ma_score, ma_tfic, ma_tfic_class) = fields
				tsv.write_line(f, gene, sample,
						   sift_score, sift_tfic, TransFIC.class_name(sift_tfic_class),
						   pph2_score, pph2_tfic, TransFIC.class_name(pph2_tfic_class),
						   ma_score, ma_tfic, TransFIC.class_name(ma_tfic_class), null_value="\N")

	projdb.close()

	sigdb.close()

	log.info("Saving project configuration ...")

	projres = ProjectResults(project)

	with open_dataset(project_id, project_path, datasets_path, "project.tsv", "w", log) as f:
		names = ["ASSEMBLY", "SAMPLES_TOTAL"]
		values = [project["assembly"], total_samples]
		names, values = projres.get_annotations_to_save(config.project.annotations, project["annotations"], names=names, values=values)
		tsv.write_line(f, *names)
		tsv.write_line(f, *values, null_value="\N")

	projects_port = task.ports("projects_out")
	projects_port.send(project)
Пример #25
0
def prepare_files(project):
	"""Prepare OncodriveCLUST input files for a project.

	Extracts the (non-)synonymous mutation data from the project database,
	writes one data file per class plus an excluded-gene-causes file, and
	sends the project augmented with an "oncodriveclust" settings dict.
	"""
	log = task.logger
	conf = task.conf

	projects_out_port = task.ports("projects_out")

	project_id = project["id"]
	log.info("--- [{0}] --------------------------------------------".format(project_id))

	config = GlobalConfig(conf)
	paths = PathsConfig(config) # avoid that project conf override path configurations
	config = GlobalConfig(conf, project["conf"])

	oclust = OncodriveClust(config.oncodriveclust, paths, log)

	project_results = ProjectResults(project)

	projdb = ProjectDb(project["db"])

	data = oclust.retrieve_data(projdb)

	projdb.close()

	# NON_SYN and SYN index this pair of output paths.
	data_paths = [
		os.path.join(project["temp_path"], "oncodriveclust-non-syn-data.tsv"),
		os.path.join(project["temp_path"], "oncodriveclust-syn-data.tsv")]

	log.info("Saving data ...")
	log.debug("> {0}".format(data_paths[NON_SYN]))
	log.debug("> {0}".format(data_paths[SYN]))

	df = [tsv.open(path, "w") for path in data_paths]

	gene_sample_count = defaultdict(int)

	for key, value in data.items():
		findex, gene, sample = key
		transcript, transcript_len, protein_pos = value

		if findex == NON_SYN:
			gene_sample_count[gene] += 1

			# Filtered genes are counted above but not written out.
			if oclust.filter_enabled and not oclust.filter.valid(gene):
				continue

		tsv.write_line(df[findex], gene, sample, protein_pos)

	for f in df:
		f.close()

	exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")

	log.info("Saving excluded gene causes ...")
	log.debug("> {0}".format(exc_path))

	# Record why each gene would be excluded: filter and/or sample threshold.
	with tsv.open(exc_path, "w") as exf:
		tsv.write_line(exf, "GENE", "EXCLUDED_CAUSE")
		for gene, sample_count in gene_sample_count.items():
			causes = []
			if oclust.filter_enabled and not oclust.filter.valid(gene):
				causes += [ProjectDb.GENE_EXC_FILTER]
			if sample_count < oclust.samples_threshold:
				causes += [ProjectDb.GENE_EXC_THRESHOLD]
			if len(causes) > 0:
				tsv.write_line(exf, gene, "".join(causes))

	log.info("Sending project ...")

	projects_out_port.send(dict(project,
								oncodriveclust=dict(
									data_paths=data_paths,
									samples_threshold=oclust.samples_threshold)))
Пример #26
0
def prepare_files(project):
	"""Prepare OncodriveFM input for a project and fan out the analyses.

	Saves the per-sample/gene functional impact matrix, records excluded
	genes, then sends one message per (feature, score slice) combination
	("genes"/"pathways" x SIFT/PPH2/MA) on the "projects_out" port.
	Returns early (sending nothing) when the project has no sample data.

	FIX: dict iteration used the Python-2-only `iteritems()`; replaced
	with `items()`, which the rest of this file already uses.
	"""
	log = task.logger
	conf = task.conf

	projects_out_port = task.ports("projects_out")

	project_id = project["id"]
	log.info("--- [{0}] --------------------------------------------".format(project_id))

	project_results = ProjectResults(project)

	projdb = ProjectDb(project["db"])

	log.info("Retrieving functional impact scores for genes ...")

	data = retrieve_data(projdb)

	projdb.close()

	# save data matrix

	dst_path = os.path.join(project["temp_path"], "oncodrivefm-data.tdm")
	sgfi_path = os.path.join(project["temp_path"], "sample_gene-fimpact.tsv.gz")
	project["sample_gene_fi_data"] = sgfi_path

	log.info("Saving functional impact scores ...")
	log.debug("> {0}".format(dst_path))

	with open(dst_path, "w") as f:
		sgff = tsv.open(sgfi_path, "w")

		tsv.write_line(f, "SAMPLE", "GENE", "SIFT", "PPH2", "MA")
		tsv.write_line(sgff, "SAMPLE", "GENE",
					   "SIFT_SCORE", "SIFT_TFIC", "SIFT_TFIC_CLASS",
					   "PPH2_SCORE", "PPH2_TFIC", "PPH2_TFIC_CLASS",
					   "MA_SCORE", "MA_TFIC", "MA_TFIC_CLASS")

		# FIX: was `data.iteritems()` (Python 2 only).
		for key, values in data.items():
			sample, gene = key

			(sift_score, sift_tfic, sift_tfic_class,
				pph2_score, pph2_tfic, pph2_tfic_class,
				ma_score, ma_tfic, ma_tfic_class) = values

			tsv.write_line(f, sample, gene, sift_score, pph2_score, ma_score)
			tsv.write_line(sgff, sample, gene,
						   sift_score, sift_tfic, sift_tfic_class,
						   pph2_score, pph2_tfic, pph2_tfic_class,
						   ma_score, ma_tfic, ma_tfic_class, null_value="-")

		sgff.close()

	# count samples

	samples = set()
	gene_sample_count = {}
	for sample, gene in data.keys():
		samples.add(sample)
		if gene not in gene_sample_count:
			gene_sample_count[gene] = 1
		else:
			gene_sample_count[gene] += 1

	num_samples = len(samples)

	if num_samples == 0:
		log.warn("There are no samples data, skipping OncodriveFM for this project")
		return

	(num_cores, estimator,
		genes_num_samplings, genes_threshold, genes_filter_enabled, genes_filter, filt,
		pathways_num_samplings, pathways_threshold) = get_oncodrivefm_configuration(log, conf, project, num_samples)

	# Create a dataset with information on why some genes are not considered for calculation in OncodriveFM
	# There are basically two possible reasons:
	# - It does not pass the filter
	# - There are less samples mutated than the threshold

	exc_path = os.path.join(project["temp_path"], "oncodrivefm-excluded-cause.tsv")

	log.info("Saving excluded gene causes ...")
	log.debug("> {0}".format(exc_path))

	with tsv.open(exc_path, "w") as exf:
		tsv.write_line(exf, "GENE", "EXCLUDED_CAUSE")
		for gene, sample_count in gene_sample_count.items():
			causes = []
			if genes_filter_enabled and not filt.valid(gene):
				causes += [ProjectDb.GENE_EXC_FILTER]
			if sample_count < genes_threshold:
				causes += [ProjectDb.GENE_EXC_THRESHOLD]
			if len(causes) > 0:
				tsv.write_line(exf, gene, "".join(causes))

	# Common part of every outgoing oncodrivefm configuration.
	ofm = dict(
		data=dst_path,
		num_cores=num_cores,
		estimator=estimator)

	for slice_name in ["SIFT", "PPH2", "MA"]:
		projects_out_port.send(dict(project,
			oncodrivefm=dict(ofm,
				 feature="genes",
				 slice=slice_name,
				 num_samplings=genes_num_samplings,
				 threshold=genes_threshold,
				 filter_enabled=genes_filter_enabled,
				 filter=genes_filter)))

	for slice_name in ["SIFT", "PPH2", "MA"]:
		projects_out_port.send(dict(project,
			oncodrivefm=dict(ofm,
				 feature="pathways",
				 slice=slice_name,
				 num_samplings=pathways_num_samplings,
				 threshold=pathways_threshold,
				 filter_enabled=genes_filter_enabled,
				 filter=genes_filter)))
Пример #27
0
def liftover(project):
    """Lift a project's variant coordinates to hg19 using UCSC liftOver.

    Writes the variants to a temporary BED file, runs the liftOver binary,
    nulls the start of unmapped variants and updates the mapped ones, then
    sends the project (with "assembly" set to hg19) on "lifted_projects".
    """
    log = task.logger
    conf = task.conf

    config = GlobalConfig(conf)

    lifted_project_port = task.ports("lifted_projects")

    log.info("--- [{0}] --------------------------------------------".format(project["id"]))

    log.info("Preparing liftOver files ...")

    in_path = make_temp_file(task, suffix=".bed")
    in_file = open(in_path, "w")
    out_path = make_temp_file(task, suffix=".bed")
    unmapped_path = os.path.join(project["temp_path"], "liftover_unmapped.bed")

    projdb = ProjectDb(project["db"])

    # BED input: chrom, start, end (start + ref length), variant id.
    for var in projdb.variants(order_by="position"):
        in_file.write(tsv.line_text("chr" + var.chr, var.start, var.start + len(var.ref), var.id))

    in_file.close()

    log.info("Running liftOver ...")

    # Record the original assembly; after lifting everything is hg19.
    project["from_assembly"] = project["assembly"]
    project["assembly"] = "hg19"

    cmd = " ".join(
        [
            conf["liftover_bin"],
            in_path,
            os.path.join(conf["liftover_chain_path"], "hg18ToHg19.over.chain"),
            out_path,
            unmapped_path,
        ]
    )

    log.debug(cmd)

    # NOTE(review): the liftOver exit code is ignored; a failed run would
    # surface later as missing output files -- confirm intended.
    subprocess.call(cmd, shell=True)

    log.info("Annotating unmapped variants ...")

    # Unmapped variants get a null start so they are excluded downstream.
    count = 0
    with open(unmapped_path, "r") as f:
        for line in f:
            if line.lstrip().startswith("#"):
                continue
            fields = line.rstrip().split("\t")
            var_id = int(fields[3])
            projdb.update_variant_start(var_id, start=None)
            count += 1

    log.info("  {0} unmapped variants annotated".format(count))

    log.info("Updating variants ...")

    count = 0
    with open(out_path, "r") as f:
        for line in f:
            fields = line.rstrip().split("\t")
            chr, start, end, var_id = fields
            projdb.update_variant_start(var_id, start=start)
            count += 1

    log.info("  {0} variants".format(count))

    remove_temp(task, in_path, out_path)

    projdb.commit()
    projdb.close()

    lifted_project_port.send(project)
Пример #28
0
def prepare_files(project):
	"""Prepare the OncodriveCLUST input files for one project.

	Writes two TSV data files (non-synonymous and synonymous protein-affecting
	mutations) plus a file listing genes excluded by the genes filter or by
	the mutations threshold, then forwards the project (extended with an
	"oncodriveclust" configuration dict) through the "projects_out" port.
	"""
	log = task.logger
	conf = task.conf

	projects_out_port = task.ports("projects_out")

	project_id = project["id"]
	log.info("--- [{0}] --------------------------------------------".format(project_id))

	project_results = ProjectResults(project)

	(mutations_threshold, genes_filter_enabled,
	 genes_filter, filt) = get_oncodriveclust_configuration(log, conf, project)

	log.info("Loading transcripts CDS length ...")

	cds_len = load_cds_len(conf)

	log.info("Retrieving gene alterations ...")

	projdb = ProjectDb(project["db"])
	data = retrieve_data(projdb, cds_len)
	projdb.close()

	data_paths = [
		os.path.join(project["temp_path"], "oncodriveclust-non-syn-data.tsv"),
		os.path.join(project["temp_path"], "oncodriveclust-syn-data.tsv")]

	log.info("Saving data ...")
	log.debug("> {0}".format(data_paths[NON_SYN]))
	log.debug("> {0}".format(data_paths[SYN]))

	# One output file per mutation class (NON_SYN / SYN index).
	out_files = [tsv.open(path, "w") for path in data_paths]

	# Number of non-synonymous mutations seen per gene (counted before the
	# genes filter is applied, so filtered genes still appear in the causes).
	gene_sample_count = {}

	for (findex, gene, sample), (_transcript, _transcript_len, protein_pos) in data.items():
		if findex == NON_SYN:
			gene_sample_count[gene] = gene_sample_count.get(gene, 0) + 1

			if genes_filter_enabled and not filt.valid(gene):
				continue

		tsv.write_line(out_files[findex], gene, sample, protein_pos)

	for out in out_files:
		out.close()

	exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")

	log.info("Saving excluded gene causes ...")
	log.debug("> {0}".format(exc_path))

	# Record, per gene, why it will be excluded from the analysis (if at all).
	with tsv.open(exc_path, "w") as exf:
		tsv.write_line(exf, "GENE", "EXCLUDED_CAUSE")
		for gene, count in gene_sample_count.items():
			causes = []
			if genes_filter_enabled and not filt.valid(gene):
				causes.append(ProjectDb.GENE_EXC_FILTER)
			if count < mutations_threshold:
				causes.append(ProjectDb.GENE_EXC_THRESHOLD)
			if causes:
				tsv.write_line(exf, gene, "".join(causes))

	log.info("Sending project ...")

	projects_out_port.send(dict(project,
								oncodriveclust=dict(
									data_paths=data_paths,
									mutations_threshold=mutations_threshold,
									genes_filter_enabled=genes_filter_enabled, # not used
									genes_filter=genes_filter))) # not used
Пример #29
0
def end():
	"""Load the OncodriveFM results of every project into its database.

	For each project group in the task context: loads the per-sample gene
	functional impacts once (first project only), records excluded-gene
	causes, and stores the per-gene and per-pathway OncodriveFM results.
	The first project of each group is forwarded through "projects_out".
	"""
	log = task.logger

	projects_out_port = task.ports("projects_out")

	log.info("Updating the projects database ...")

	for project_id, projects in task.context.items():

		log.info("[{0}]".format(project_id))

		for index, project in enumerate(projects):
			projdb = ProjectDb(project["db"])

			if index == 0:
				# Functional impacts are shared by the group: load them only
				# for the first project.
				log.info("  Functional impact ...")

				projdb.delete_sample_gene_fimpact()

				with tsv.open(project["sample_gene_fi_data"], "r") as f:
					row_types = (int, str, float, float, int, float, float, int, float, float, int)
					for row in tsv.lines(f, row_types, header=True, null_value="-"):
						projdb.add_sample_gene_fimpact(*row)

			# Take the OncodriveFM result list out of the project dict.
			ofm_results = project.pop("oncodrivefm")

			exc_path = os.path.join(project["temp_path"], "oncodrivefm-excluded-cause.tsv")

			log.info("  Excluded gene causes ...")
			log.debug("    > {0}".format(exc_path))

			num_excluded = 0
			with tsv.open(exc_path, "r") as exf:
				for gene, cause in tsv.lines(exf, (str, str), header=True):
					projdb.update_gene(Gene(id=gene, fm_exc_cause=cause))
					num_excluded += 1

			log.debug("    {0} genes excluded".format(num_excluded))

			for feature, results_path in ofm_results:

				log.info("  {0} ...".format(feature))
				log.debug("    > {0}".format(results_path))

				if feature == "genes":
					with tsv.open(results_path, "r") as f:
						num_rows = 0
						for gene, pvalue, qvalue in tsv.lines(f, (str, float, float), header=True):
							projdb.update_gene(Gene(id=gene, fm_pvalue=pvalue,
													fm_qvalue=qvalue, fm_exc_cause=ProjectDb.NO_GENE_EXC))
							num_rows += 1
						log.info("    {0} genes".format(num_rows))
				elif feature == "pathways":
					with tsv.open(results_path, "r") as f:
						num_rows = 0
						for pathway, zscore, pvalue, qvalue in tsv.lines(f, (str, float, float, float), header=True):
							projdb.update_pathway(Pathway(id=pathway, fm_zscore=zscore,
														  fm_pvalue=pvalue, fm_qvalue=qvalue))
							num_rows += 1
						log.info("    {0} pathways".format(num_rows))

			projdb.commit()

			projdb.close()

		projects_out_port.send(projects[0])
Пример #30
0
def scan_files(project):
	"""Create and populate the project variants database.

	Downloads the project source files from storage, parses the variants
	they contain into a freshly created ProjectDb (preloaded with the
	reference genes and KEGG pathways) and sends the project to the next
	module. hg18 projects are routed through the "liftover_projects" port,
	hg19 projects through "projects_out".

	Raises:
		Exception: when the assembly is neither hg18 nor hg19, or when a
			source file yields no variants.
	"""
	log = task.logger
	conf = task.conf

	config = GlobalConfig(conf)
	paths = PathsConfig(config)

	projects_port, liftover_projects_port = task.ports("projects_out", "liftover_projects")

	project_id = project["id"]
	project_path = project["path"]
	projdb_path = project["db"]
	assembly = project["assembly"]

	log.info("--- [{0}] --------------------------------------------".format(project_id))

	# hg18 projects must be lifted over to hg19 before further processing.
	if assembly == "hg18":
		out_port = liftover_projects_port
	elif assembly == "hg19":
		out_port = projects_port
	else:
		raise Exception("Unexpected assembly: {0}".format(assembly))

	# Always rebuild the database from scratch.
	if os.path.exists(projdb_path):
		os.remove(projdb_path)

	log.info("Creating variants database ...")

	# Build in a temporary file and copy it into place at the end, so a failed
	# run does not leave a half-built database behind.
	projdb_tmp_path = make_temp_file(task, suffix=".db")

	log.debug(projdb_tmp_path)

	projdb = ProjectDb(projdb_tmp_path).create()

	log.info("Loading genes ...")

	projdb.load_genes(paths.data_ensembl_genes_path())

	log.info("Loading pathways ...")

	projdb.load_pathways(
		paths.data_kegg_def_path(),
		paths.data_kegg_ensg_map_path())

	log.info("Parsing variants ...")

	for obj_name in project["storage_objects"]:
		log.info("Downloading {} ...".format(obj_name))
		dst_path = os.path.join(project_path, "sources", os.path.basename(obj_name))
		dst_dirname = os.path.dirname(dst_path)
		if not os.path.exists(dst_dirname):
			os.makedirs(dst_dirname)
		# TODO: do not copy the source file (do not specify dst_path)
		task.storage.get_object(obj_name).get_data(dst_path)

		# A storage object may be an archive containing several source files.
		for container_name, path, name, ext, f in archived_files(dst_path):
			fname = os.path.join(path, name + ext)
			if container_name is not None:
				source_name = "{0}:{1}".format(os.path.basename(container_name), fname)
			else:
				source_name = name + ext

			log.info("=> {0} ...".format(source_name))

			# By convention the sample id is the file name without extension.
			sample_id = os.path.basename(name)

			# Pick the parser from the file extension; default to tab-separated.
			if ext.lower() in _SUPPORTED_EXTENSIONS:
				parser_type = ext[1:]
			else:
				parser_type = "tab"

			parser = create_variants_parser(parser_type, f, source_name, sample_id)

			source_id = projdb.add_source(source_name)

			var_ids = set()
			for var in parser:
				# Flush the raw source lines consumed so far so each variant
				# can be traced back to its originating line.
				for line_num, text in parser.read_lines():
					projdb.add_source_line(source_id, line_num, text)

				var_id = projdb.add_variant(var, source_id=source_id, line_num=parser.get_line_num())
				var_ids.add(var_id)

			# Flush any trailing lines read after the last variant.
			for line_num, text in parser.read_lines():
				projdb.add_source_line(source_id, line_num, text)

			num_variants = len(var_ids)
			log.info("   {0} variants".format(num_variants))

			if num_variants == 0:
				raise Exception("No variants found in source '{}'. "
								"Please check the documentation for the expected input for '{}' format.".format(
								source_name, parser.name))

	projdb.commit()
	projdb.close()

	log.info("Copying variants database ...")

	log.debug("{0} -> {1}".format(projdb_tmp_path, projdb_path))

	shutil.copy(projdb_tmp_path, projdb_path)

	remove_temp(task, projdb_tmp_path)

	out_port.send(project)
Пример #31
0
def compute(project):
	"""Run oncodrivefm-compute for one project feature slice.

	Builds the oncodrivefm-compute command line from the project's
	"oncodrivefm" configuration, runs it through the shell, replaces the
	configuration with the expected results location and forwards the
	project through "projects_out".

	Raises:
		Exception: when oncodrivefm-compute exits with a non-zero code.
	"""
	log = task.logger

	config = GlobalConfig(task.conf)
	paths = PathsConfig(config)

	projects_out_port = task.ports("projects_out")

	project_id = project["id"]
	log.info("--- [{0}] --------------------------------------------".format(project_id))

	ofm = Data.element(project["oncodrivefm"])

	feature = ofm["feature"]
	slice_name = ofm["slice"]
	estimator = ofm.get("estimator")
	num_cores = ofm.get("num_cores", dtype=str)
	num_samplings = ofm.get("num_samplings", dtype=str)
	samples_threshold = ofm.get("samples_threshold", dtype=str)
	filter_enabled = ofm.get("filter_enabled", dtype=bool)
	filter_path = ofm.get("filter_path", dtype=str)

	# Log every effective parameter for traceability.
	for param_name, param_value in (
			("feature", feature),
			("slice", slice_name),
			("estimator", estimator),
			("num_cores", num_cores),
			("num_samplings", num_samplings),
			("samples_threshold", samples_threshold),
			("filter_enabled", filter_enabled),
			("filter_path", os.path.basename(filter_path))):
		log.info("{0} = {1}".format(param_name, param_value))

	cmd_args = [
		"oncodrivefm-compute",
		"-o", project["temp_path"],
		"-n oncodrivefm-{0}".format(feature),
		"-N", num_samplings,
		"--threshold", samples_threshold,
		"-e {0}".format(estimator),
		"-j", num_cores,
		"--slices '{0}'".format(slice_name)]

	if filter_enabled:
		cmd_args.extend(["--filter", filter_path])

	# Pathway analysis needs the gene-to-pathway mapping file.
	if feature == "pathways":
		cmd_args.extend(["-m", paths.data_kegg_path("ensg_kegg.tsv")])

	cmd_args.append(ofm["data"])

	# Replace the configuration with the expected results location for the
	# downstream modules.
	project["oncodrivefm"] = dict(
		feature=feature,
		slice=slice_name,
		results=os.path.join(project["temp_path"], "oncodrivefm-{0}-{1}.tsv".format(feature, slice_name)))

	cmd = " ".join(cmd_args)

	log.debug(cmd)

	if subprocess.call(cmd, shell=True) != 0:
		raise Exception("OncodriveFM error while computing {0}:\n{1}".format(feature, cmd))

	projects_out_port.send(project)
Пример #32
0
def gene_impact(project):
	"""Aggregate per-partition functional impacts into (variant, gene) records.

	Reads every partition's .tfi file and keeps, per (variant id, gene) pair:
	the highest impact of each predictor (SIFT, PolyPhen2, MutationAssessor),
	whether any transcript falls in a coding region, and the set of observed
	protein changes. Writes one line per pair to variant-gene_impact.tsv and
	forwards the project (with "gfi_path" set) through the "projects" port.
	"""
	log = task.logger

	config = GlobalConfig(task.conf)
	paths = PathsConfig(config)

	projects_port = task.ports("projects")

	log.info("--- [{0}] --------------------------------------------".format(project["id"]))

	partitions = project["partitions"]

	log.info("Reading {} partitions ...".format(len(partitions)))

	# (var_id, gene) -> aggregated attributes dict.
	aff_gene_attrs = {}

	for partition in partitions:
		log.info(" Partition {} ...".format(partition["index"]))
		with open(partition["tfi_path"], "r") as f:
			# "-" is the null marker; a null coding-region flag means False.
			bool_type = lambda val: bool(int(val)) if val is not None else False
			types = (int, str, str, bool_type, int, int, int, int)
			columns = [0, 2, 4, 5, 6, 10, 14, 18]
			for fields in tsv.lines(f, types, columns=columns, null_value="-"):
				# tr_impact is read for column alignment but not used here.
				(var_id, gene, prot_change, coding_region, tr_impact,
				 	sift_impact, pph2_impact, ma_impact) = fields

				# (bool_type already yields a bool, so no == 1 normalization
				# is needed here.)

				aff_gene = (var_id, gene)

				# update aggregated impact for all the predictors
				update_attr(aff_gene_attrs, aff_gene, "sift_impact", sift_impact, update=TransFIC.higher_impact)
				update_attr(aff_gene_attrs, aff_gene, "pph2_impact", pph2_impact, update=TransFIC.higher_impact)
				update_attr(aff_gene_attrs, aff_gene, "ma_impact", ma_impact, update=TransFIC.higher_impact)

				# update whether the affected gene is a coding region or not
				update_attr(aff_gene_attrs, aff_gene, "coding_region", coding_region,
							update=lambda prev_value, value: prev_value or value)

				# aggregate protein changes per affected_gene
				if prot_change is not None:
					update_attr(aff_gene_attrs, aff_gene, "prot_changes", prot_change,
									 new=lambda value: set([value]),
									 update=lambda prev_value, value: prev_value | set([value]))

	num_vars = len(set([var_id for var_id, gene in aff_gene_attrs.keys()]))
	num_genes = len(set([gene for var_id, gene in aff_gene_attrs.keys()]))
	log.info("Saving {} variant-gene impacts ({} variants and {} genes) ...".format(len(aff_gene_attrs), num_vars, num_genes))

	gfi_path = os.path.join(project["csq_path"], "variant-gene_impact.tsv")
	with open(gfi_path, "w") as vf:
		for aff_gene, attrs in aff_gene_attrs.items():
			var_id, gene = aff_gene

			# get the impact by trust priority: ma, pph2, sift
			impact = attrs.get("ma_impact") or attrs.get("pph2_impact") or attrs.get("sift_impact") or TransFIC.UNKNOWN_IMPACT_CLASS

			coding_region = attrs.get("coding_region", False)
			coding_region = 1 if coding_region else 0

			prot_changes = attrs.get("prot_changes")
			prot_changes = ",".join(prot_changes) if prot_changes is not None else None

			tsv.write_line(vf, var_id, gene, impact, coding_region, prot_changes, null_value="-")

	# Send results to the next module
	project["gfi_path"] = gfi_path
	projects_port.send(project)
Пример #33
0
def fimpact_run(partition):
	"""Compute functional impact scores for one partition of VEP results.

	Combines per-variant MutationAssessor (MA) scores with VEP's SIFT and
	PolyPhen2 scores, transforms them with TransFIC, and writes two files:
	  - <index>.tfi : one line per (variant, transcript) with raw scores,
	    transFIC scores/classes and an aggregated impact.
	  - <index>.gfi : one line per (variant, gene) with the aggregated
	    impact, coding-region flag and protein changes.
	The partition dict (extended with "tfi_path" and "gfi_path") is sent
	through the "results" port.
	"""
	log = task.logger
	conf = task.conf

	results_port = task.ports("results")

	project = partition["project"]

	log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))

	log.info("Reading MA scores ...")

	# MutationAssessor results indexed by variant id.
	ma_uniprot = {}
	ma_scores = {}

	with open(partition["ma_path"], "r") as f:
		for var_id, uniprot, fi_score in tsv.lines(f, (int, str, float), null_value="-"):
			ma_uniprot[var_id] = uniprot
			ma_scores[var_id] = fi_score

	log.info("Reading VEP results and calculating functional impact ...")

	tfic = TransFIC(data_path=os.path.join(conf["data_path"], "TransFIC"))

	tfi_path = os.path.join(partition["base_path"], "{0:08d}.tfi".format(partition["index"]))
	cf = open(tfi_path, "w")

	# (var_id, gene) -> aggregated attributes (impacts, coding flag, protein
	# changes) accumulated while streaming the VEP file.
	aff_gene_attrs = {}

	with open(partition["vep_path"], "r") as f:
		for fields in tsv.lines(f, (int, str, str, str, str, str, str, float, float), null_value="-"):
			(var_id, gene, transcript, ct,
				protein_pos, aa_change, protein,
				sift_score, pph2_score) = fields

			# ct is a comma-separated list of consequence terms.
			if ct is not None:
				ct = ct.split(",")
			else:
				ct = []

			# Invert sift score
			if sift_score is not None:
				sift_score = 1.0 - sift_score

			ma_score = None

			uniprot = ma_uniprot[var_id] if var_id in ma_uniprot else None

			sift_impact = pph2_impact = ma_impact = None # TransFIC.UNKNOWN_IMPACT_CLASS

			coding_region = so.match(ct, so.CODING_REGION)

			calculate_transfic = True

			# Classify the consequence type and preassign impacts/scores for
			# the cases where the predictors' own scores are not meaningful.
			ct_type = None
			if so.match(ct, so.NON_SYNONYMOUS):       # missense
				ct_type = TransFIC.CT_NON_SYNONYMOUS
				ma_score = ma_scores[var_id] if var_id in ma_scores else None
			elif so.match(ct, so.STOP):               # stop
				ct_type = TransFIC.CT_STOP
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.FRAMESHIFT):         # frameshift
				ct_type = TransFIC.CT_FRAMESHIFT
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.SPLICE):             # splice
				ct_type = "splice"
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS if so.match(ct, so.SPLICE_JUNCTION) else TransFIC.UNKNOWN_IMPACT_CLASS
				calculate_transfic = False
			elif so.match(ct, so.SYNONYMOUS):         # synonymous
				ct_type = TransFIC.CT_SYNONYMOUS
				sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
				sift_score = pph2_score = 0.0
				ma_score = -2
			else:
				sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
				calculate_transfic = False

			if calculate_transfic:
				(sift_tfic, sift_class,
				 pph2_tfic, pph2_class,
				 ma_tfic, ma_class) = tfic.calculate("gosmf", gene, ct_type, sift_score, pph2_score, ma_score)

				# if the impact was not preassigned get it from the transFIC calculated class
				sift_impact = sift_class if sift_impact is None and sift_class in IMPACT_CLASSES else sift_impact
				pph2_impact = pph2_class if pph2_impact is None and pph2_class in IMPACT_CLASSES else pph2_impact
				ma_impact = ma_class if ma_impact is None and ma_class in IMPACT_CLASSES else ma_impact
			else:
				sift_tfic, sift_class, pph2_tfic, pph2_class, ma_tfic, ma_class = (None, None, None, None, None, None)

			aff_gene = (var_id, gene)

			# update aggregated impact for all the predictors
			update_attr(aff_gene_attrs, aff_gene, "sift_impact", sift_impact, update=TransFIC.higher_impact)
			update_attr(aff_gene_attrs, aff_gene, "pph2_impact", pph2_impact, update=TransFIC.higher_impact)
			update_attr(aff_gene_attrs, aff_gene, "ma_impact", ma_impact, update=TransFIC.higher_impact)

			# update whether the affected gene is a coding region or not
			update_attr(aff_gene_attrs, aff_gene, "coding_region", coding_region,
						update=lambda prev_value, value: prev_value or value)

			# aggregate protein changes per affected_gene
			# try to follow the convention http://www.hgvs.org/mutnomen/recs-prot.html
			prot_change = None
			if ct_type == TransFIC.CT_FRAMESHIFT:
				if protein_pos is None:
					prot_change = "fs"
				else:
					prot_change = "fs {0}".format(protein_pos)
				#log.debug("FRAMESHIFT: gene={}, protein={}, pos={}, change={}".format(gene, protein, protein_pos, aa_change))
			elif ct_type == "splice":
				prot_change = "r.spl?"
				#log.debug("SPLICE: gene={}, protein={}, pos={}, change={}".format(gene, protein, protein_pos, aa_change))
			elif protein_pos is not None and aa_change is not None:
				rc = ReContext()
				if rc.match(SIMPLE_AA_CHANGE_RE, aa_change):
					prot_change = "{ref}{pos}{alt}".format(pos=protein_pos, ref=rc.group(1), alt=rc.group(2) or "=")
				elif rc.match(COMPLEX_AA_CHANGE_RE, aa_change):
					prot_change = "{0} {1}".format(aa_change, protein_pos)
				else:
					log.warn("Unmatched aa change: gene={}, protein={}, pos={}, change={}, ct=[{}]".format(
													gene, protein, protein_pos, aa_change, ", ".join(ct)))

			if prot_change is not None:
				update_attr(aff_gene_attrs, aff_gene, "prot_changes", prot_change,
								 new=lambda value: set([value]),
								 update=lambda prev_value, value: prev_value | set([value]))

			# per-transcript impact by trust priority: ma, pph2, sift
			impact = ma_impact or pph2_impact or sift_impact or TransFIC.UNKNOWN_IMPACT_CLASS

			tsv.write_line(cf, var_id, transcript, uniprot,
						   sift_score, sift_tfic, sift_class,
						   pph2_score, pph2_tfic, pph2_class,
						   ma_score, ma_tfic, ma_class,
						   impact, null_value="-")

	cf.close()

	log.info("Saving variant impacts ...")

	gfi_path = os.path.join(partition["base_path"], "{0:08d}.gfi".format(partition["index"]))
	vf = open(gfi_path, "w")
	for aff_gene, attrs in aff_gene_attrs.items():
		var_id, gene = aff_gene
		# get the impact by trust priority: ma, pph2, sift
		impact = attrs.get("ma_impact") or attrs.get("pph2_impact") or attrs.get("sift_impact") or TransFIC.UNKNOWN_IMPACT_CLASS
		coding_region = attrs.get("coding_region", False)
		coding_region = 1 if coding_region else 0
		prot_changes = attrs.get("prot_changes")
		prot_changes = ",".join(prot_changes) if prot_changes is not None else None

		tsv.write_line(vf, var_id, gene, impact, coding_region, prot_changes, null_value="-")
	vf.close()

	# Send results to the next module
	partition["tfi_path"] = tfi_path
	partition["gfi_path"] = gfi_path
	results_port.send(partition)
Пример #34
0
def fimpact_run(partition):
	"""Compute functional impact scores for one partition of VEP results.

	Variant of the transFIC pipeline that only writes the per-transcript
	impact file (<index>.tfi); the per-gene aggregation is done downstream.
	Writes per (variant, transcript): gene, uniprot, protein change,
	coding-region flag, aggregated transcript impact, and the raw/transFIC
	scores and classes per predictor. The partition dict (extended with
	"tfi_path") is sent through the "results" port.
	"""
	log = task.logger

	config = GlobalConfig(task.conf)
	paths = PathsConfig(config)

	results_port = task.ports("results")

	project = partition["project"]

	log.info("--- [{0} @ {1}] --------------------------------------------".format(project["id"], partition["index"]))

	log.info("Reading MA scores ...")

	# MutationAssessor results indexed by variant id.
	ma_uniprot = {}
	ma_scores = {}

	with open(partition["ma_path"], "r") as f:
		for var_id, uniprot, fi_score in tsv.lines(f, (int, str, float), null_value="-"):
			ma_uniprot[var_id] = uniprot
			ma_scores[var_id] = fi_score

	log.info("Reading VEP results and calculating functional impact ...")

	tfic = TransFIC(data_path=paths.data_transfic_path())

	tfi_path = os.path.join(partition["base_path"], "{0:08d}.tfi".format(partition["index"]))
	cf = open(tfi_path, "w")

	with open(partition["vep_path"], "r") as f:
		for fields in tsv.lines(f, (int, str, str, str, str, str, str, float, float), null_value="-"):
			(var_id, gene, transcript, ct,
				protein_pos, aa_change, protein,
				sift_score, pph2_score) = fields

			# ct is a comma-separated list of consequence terms.
			# NOTE(review): a null ct yields [""] here, whereas the sibling
			# implementation yields [] — confirm so.match treats them alike.
			ct = (ct or "").split(",")

			# Invert sift score
			if sift_score is not None:
				sift_score = 1.0 - sift_score

			ma_score = None

			uniprot = ma_uniprot.get(var_id)

			sift_impact = pph2_impact = ma_impact = None # TransFIC.UNKNOWN_IMPACT_CLASS

			coding_region = 1 if so.match(ct, so.CODING_REGION) else 0

			sift_tfic, sift_class, pph2_tfic, pph2_class, ma_tfic, ma_class = (None, None, None, None, None, None)

			# transFIC is computed only for missense variants; every other
			# consequence type gets fixed scores and impact classes.
			ct_type = None
			if so.match(ct, so.NON_SYNONYMOUS):       # missense
				ct_type = TransFIC.CT_NON_SYNONYMOUS
				ma_score = ma_scores.get(var_id)

				(sift_tfic, sift_class,
				 pph2_tfic, pph2_class,
				 ma_tfic, ma_class) = tfic.calculate("gosmf", gene, ct_type, sift_score, pph2_score, ma_score)

				# keep the transFIC class only if it is a recognized impact class
				sift_impact = sift_class if sift_class in IMPACT_CLASSES else sift_impact
				pph2_impact = pph2_class if pph2_class in IMPACT_CLASSES else pph2_impact
				ma_impact = ma_class if ma_class in IMPACT_CLASSES else ma_impact
			elif so.match(ct, so.STOP):               # stop
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.FRAMESHIFT):         # frameshift
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.SPLICE_JUNCTION):    # splice junction
				sift_impact = pph2_impact = ma_impact = TransFIC.HIGH_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.SPLICE_REGION):      # splice region
				sift_impact = pph2_impact = ma_impact = TransFIC.UNKNOWN_IMPACT_CLASS
				sift_score = pph2_score = 1.0
				ma_score = 3.5
			elif so.match(ct, so.SYNONYMOUS):         # synonymous
				sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS
				sift_score = pph2_score = 0.0
				ma_score = -2
			else:
				sift_impact = pph2_impact = ma_impact = TransFIC.NONE_IMPACT_CLASS

			# NOTE(review): aff_gene is unused in this implementation —
			# apparently a leftover from the aggregating sibling version.
			aff_gene = (var_id, gene)

			# try to follow the convention http://www.hgvs.org/mutnomen/recs-prot.html
			# NOTE(review): ct_type is only ever set to CT_NON_SYNONYMOUS above,
			# so the CT_FRAMESHIFT and "splice" branches below can never fire
			# (unlike the sibling implementation) — confirm this is intended.
			prot_change = None
			if ct_type == TransFIC.CT_FRAMESHIFT:
				if protein_pos is None:
					prot_change = "fs"
				else:
					prot_change = "fs {0}".format(protein_pos)
				#log.debug("FRAMESHIFT: gene={}, protein={}, pos={}, change={}".format(gene, protein, protein_pos, aa_change))
			elif ct_type == "splice":
				prot_change = "r.spl?"
				#log.debug("SPLICE: gene={}, protein={}, pos={}, change={}".format(gene, protein, protein_pos, aa_change))
			elif protein_pos is not None and aa_change is not None:
				rc = ReContext()
				if rc.match(SIMPLE_AA_CHANGE_RE, aa_change):
					prot_change = "{ref}{pos}{alt}".format(pos=protein_pos, ref=rc.group(1), alt=rc.group(2) or "=")
				elif rc.match(COMPLEX_AA_CHANGE_RE, aa_change):
					prot_change = "{0} {1}".format(aa_change, protein_pos)
				else:
					log.warn("Unmatched aa change: gene={}, protein={}, pos={}, change={}, ct=[{}]".format(
													gene, protein, protein_pos, aa_change, ", ".join(ct)))

			# transcript impact by trust priority: ma, pph2, sift
			tr_impact = ma_impact or pph2_impact or sift_impact or TransFIC.UNKNOWN_IMPACT_CLASS

			tsv.write_line(cf, var_id, transcript, gene, uniprot, prot_change, coding_region, tr_impact,
						   sift_score, sift_tfic, sift_class, sift_impact,
						   pph2_score, pph2_tfic, pph2_class, pph2_impact,
						   ma_score, ma_tfic, ma_class, ma_impact,
						   null_value="-")

	cf.close()

	# Send results to the next module
	partition["tfi_path"] = tfi_path
	results_port.send(partition)