예제 #1
0
def projects(project):
	"""Compute the recurrences of a project and forward it downstream.

	Opens the project database found at ``project["db"]`` and computes the
	recurrences for the affected (variant) genes. Unless the ``variants_only``
	configuration flag is set, recurrences for genes and pathways are computed
	as well. On success the changes are committed and the project is sent
	through the ``projects_out`` port.

	When the database reports no affected samples, a warning is logged and the
	project is NOT forwarded.

	:param project: mapping providing at least the ``"id"`` and ``"db"`` keys.
	"""
	log = task.logger
	conf = task.conf

	projects_out_port = task.ports("projects_out")

	log.info("--- [{0}] --------------------------------------------".format(project["id"]))

	projdb = ProjectDb(project["db"])
	try:
		total_samples = projdb.get_total_affected_samples()

		if total_samples == 0:
			log.warn("There are no samples, recurrences cannot be calculated.")
			return

		log.info("Calculating project recurrences for variant genes ...")

		projdb.compute_affected_genes_recurrences(total_samples)

		if not conf.get("variants_only", False):

			log.info("Calculating project recurrences for genes ...")

			projdb.compute_gene_recurrences(total_samples)

			log.info("Calculating project recurrences for pathways ...")

			projdb.compute_pathway_recurrences(total_samples)

		projdb.commit()
	finally:
		# Release the database handle on every exit path, including failures
		# raised by any of the computations above.
		projdb.close()

	projects_out_port.send(project)
예제 #2
0
def update_db(project):
	"""Store the OncodriveCLUST results of a project into its database.

	The ``"oncodriveclust"`` entry is removed from ``project`` (it is not
	forwarded downstream). If the results file it points to does not exist,
	a warning is logged and the project is NOT forwarded.

	Otherwise two files are loaded into the project database:

	* ``oncodriveclust-excluded-cause.tsv`` from ``project["temp_path"]``:
	  each row sets the exclusion cause of a gene.
	* the OncodriveCLUST results file: each row sets the cluster coordinates,
	  z-score, p-value and q-value of a gene and resets its exclusion cause to
	  ``ProjectDb.NO_GENE_EXC``.

	Finally the changes are committed and the project is sent through the
	``projects_out`` port.

	:param project: mapping providing at least the ``"id"``, ``"db"``,
		``"temp_path"`` and ``"oncodriveclust"`` keys; the latter must provide
		a ``"results"`` path.
	"""
	log = task.logger

	projects_out_port = task.ports("projects_out")

	log.info("--- [{0}] --------------------------------------------".format(project["id"]))

	# Consumed here: the entry is dropped even when the project is skipped below
	oclust = project.pop("oncodriveclust")
	results_path = oclust["results"]

	if not os.path.exists(results_path):
		log.warn("No results have been found. Skipping it.")
		return

	log.info("Updating the project database ...")

	projdb = ProjectDb(project["db"])
	try:
		exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")

		log.info("  Excluded gene causes ...")
		log.debug("    > {0}".format(exc_path))

		count = 0
		with tsv.open(exc_path, "r") as exf:
			for gene, cause in tsv.lines(exf, (str, str), header=True):
				projdb.update_gene(Gene(id=gene, clust_exc_cause=cause))
				count += 1

		log.debug("    {0} genes excluded".format(count))

		log.info("  OncodriveCLUST results ...")

		types = (str, str, float, float, float)
		columns = ("GENE", "CLUST_COORDS", "ZSCORE", "PVALUE", "QVALUE")
		with tsv.open(results_path, "r") as f:
			for gene, coords, zscore, pvalue, qvalue in tsv.lines(
					f, types, columns=columns, header=True, null_value="NA"):
				# Genes present in the results are flagged as not excluded; this
				# overrides any cause written by the previous loop.
				projdb.update_gene(Gene(id=gene, clust_coords=coords, clust_zscore=zscore, clust_pvalue=pvalue,
										clust_qvalue=qvalue, clust_exc_cause=ProjectDb.NO_GENE_EXC))

		projdb.commit()
	finally:
		# Release the database handle even if reading the files or updating fails
		projdb.close()

	projects_out_port.send(project)
예제 #3
0
def scan_files(project):
	"""Build the variants database of a project from its input files.

	The database is created in a temporary file, filled with genes, pathways
	and the variants parsed from every entry of ``project["files"]``, and then
	copied to ``project["db"]``. Any database already present at that path is
	removed first.

	The project is finally sent through the ``liftover_projects`` port when its
	assembly is ``hg18`` or through the ``projects_out`` port when it is
	``hg19``.

	:param project: mapping providing at least the ``"id"``, ``"db"``,
		``"assembly"`` and ``"files"`` keys.
	:raises InternalError: if an input path is not absolute.
	:raises Exception: if the assembly is neither ``hg18`` nor ``hg19``, if an
		input path does not exist or is not a regular file, or if a source
		yields no variants.
	"""
	log = task.logger
	conf = task.conf

	projects_port, liftover_projects_port = task.ports("projects_out", "liftover_projects")

	project_id = project["id"]
	projdb_path = project["db"]
	assembly = project["assembly"]

	log.info("--- [{0}] --------------------------------------------".format(project_id))

	if assembly == "hg18":
		out_port = liftover_projects_port
	elif assembly == "hg19":
		out_port = projects_port
	else:
		raise Exception("Unexpected assembly: {0}".format(assembly))

	# An existing database is never reused: it is discarded and rebuilt
	if os.path.exists(projdb_path):
		os.remove(projdb_path)

	log.info("Creating variants database ...")

	projdb_tmp_path = make_temp_file(task, suffix=".db")

	log.debug(projdb_tmp_path)

	projdb = ProjectDb(projdb_tmp_path)
	try:
		projdb.create()

		log.info("Loading genes ...")

		projdb.load_genes(get_data_ensembl_genes_path(conf))

		log.info("Loading pathways ...")

		projdb.load_pathways(
			get_data_kegg_def_path(conf),
			get_data_kegg_ensg_map_path(conf))

		log.info("Parsing variants ...")

		for file_path in project["files"]:
			if not os.path.isabs(file_path):
				raise InternalError("Non absolute path found: {0}".format(file_path))

			if not os.path.exists(file_path):
				raise Exception("Input file not found: {0}".format(file_path))

			if not os.path.isfile(file_path):
				raise Exception("Not a file: {0}".format(file_path))

			for container_name, path, name, ext, f in archived_files(file_path):
				fname = os.path.join(path, name + ext)
				if container_name is not None:
					source_name = "{0}:{1}".format(os.path.basename(container_name), fname)
				else:
					source_name = name + ext

				log.info("=> {0} ...".format(source_name))

				# The sample identifier is the file name without its extension
				sample_id = os.path.basename(name)

				# NOTE(review): the membership test is case-insensitive but the
				# parser type keeps the original case of the extension; confirm
				# that create_variants_parser accepts e.g. "VCF".
				if ext.lower() in _SUPPORTED_EXTENSIONS:
					parser_type = ext[1:]
				else:
					parser_type = "tab"

				parser = create_variants_parser(parser_type, f, source_name, sample_id)

				source_id = projdb.add_source(source_name)

				var_ids = set()
				for var in parser:
					# Store the source lines made available by the parser before
					# registering the variant they led to.
					for line_num, text in parser.read_lines():
						projdb.add_source_line(source_id, line_num, text)

					var_id = projdb.add_variant(var, source_id=source_id, line_num=parser.get_line_num())
					var_ids.add(var_id)

				# Store whatever lines are still pending once the parser is exhausted
				for line_num, text in parser.read_lines():
					projdb.add_source_line(source_id, line_num, text)

				num_variants = len(var_ids)
				log.info("   {0} variants".format(num_variants))

				if num_variants == 0:
					raise Exception("No variants found in source '{}'. "
									"Please check the documentation for the expected input for '{}' format.".format(
									source_name, parser.name))

		projdb.commit()
	finally:
		# Release the database handle even when loading or parsing fails
		projdb.close()

	log.info("Copying variants database ...")

	log.debug("{0} -> {1}".format(projdb_tmp_path, projdb_path))

	shutil.copy(projdb_tmp_path, projdb_path)

	remove_temp(task, projdb_tmp_path)

	out_port.send(project)
예제 #4
0
def end():
    """Load the OncodriveFM results into the project databases.

    Iterates over ``task.context``, which maps a project identifier to a list
    of project mappings. For every project in a list:

    * only for the first one, the sample/gene functional impact data is
      deleted from the database and reloaded from
      ``project["sample_gene_fi_data"]``;
    * the ``"oncodrivefm"`` entry is removed from the project; it is iterated
      as ``(feature, results_path)`` pairs;
    * gene exclusion causes are loaded from ``oncodrivefm-excluded-cause.tsv``
      in ``project["temp_path"]``;
    * results of the ``"genes"`` and ``"pathways"`` features are stored; other
      features are only logged.

    After a list has been processed, its first project is sent through the
    ``projects_out`` port.
    """
    log = task.logger

    projects_out_port = task.ports("projects_out")

    log.info("Updating the projects database ...")

    for project_id, projects in task.context.items():

        log.info("[{0}]".format(project_id))

        for index, project in enumerate(projects):
            projdb = ProjectDb(project["db"])
            try:
                if index == 0:
                    log.info("  Functional impact ...")

                    projdb.delete_sample_gene_fimpact()

                    with tsv.open(project["sample_gene_fi_data"], "r") as f:
                        types = (int, str, float, float, int, float, float, int, float, float, int)
                        for fields in tsv.lines(f, types, header=True, null_value="-"):
                            projdb.add_sample_gene_fimpact(*fields)

                # Consumed here: the entry is not forwarded downstream
                ofm = project.pop("oncodrivefm")

                exc_path = os.path.join(project["temp_path"], "oncodrivefm-excluded-cause.tsv")

                log.info("  Excluded gene causes ...")
                log.debug("    > {0}".format(exc_path))

                count = 0
                with tsv.open(exc_path, "r") as exf:
                    for gene, cause in tsv.lines(exf, (str, str), header=True):
                        projdb.update_gene(Gene(id=gene, fm_exc_cause=cause))
                        count += 1

                log.debug("    {0} genes excluded".format(count))

                for feature, results_path in ofm:

                    log.info("  {0} ...".format(feature))
                    log.debug("    > {0}".format(results_path))

                    if feature == "genes":
                        with tsv.open(results_path, "r") as f:
                            count = 0
                            for gene, pvalue, qvalue in tsv.lines(f, (str, float, float), header=True):
                                # Genes with results are flagged as not excluded,
                                # overriding any cause loaded above.
                                projdb.update_gene(
                                    Gene(id=gene, fm_pvalue=pvalue, fm_qvalue=qvalue,
                                         fm_exc_cause=ProjectDb.NO_GENE_EXC)
                                )
                                count += 1
                            log.info("    {0} genes".format(count))
                    elif feature == "pathways":
                        with tsv.open(results_path, "r") as f:
                            count = 0
                            for pathway, zscore, pvalue, qvalue in tsv.lines(
                                    f, (str, float, float, float), header=True):
                                projdb.update_pathway(
                                    Pathway(id=pathway, fm_zscore=zscore, fm_pvalue=pvalue, fm_qvalue=qvalue)
                                )
                                count += 1
                            log.info("    {0} pathways".format(count))

                projdb.commit()
            finally:
                # Release the database handle even when loading a file fails
                projdb.close()

        # Only one project per identifier is forwarded downstream
        projects_out_port.send(projects[0])
예제 #5
0
def liftover(project):
	"""Convert the variant coordinates of a project to hg19 using liftOver.

	All variants of the project database are written to a temporary BED file
	(``chr<chr>``, start, start + length of the reference allele, variant id),
	liftOver is run with the ``hg18ToHg19.over.chain`` chain file, and the
	database is updated from its outputs:

	* variants listed in the unmapped file get their start set to ``None``;
	* variants listed in the output file get their start replaced by the
	  lifted one.

	Once the database has been committed, the previous assembly is saved in
	``project["from_assembly"]``, ``project["assembly"]`` is set to ``"hg19"``
	and the project is sent through the ``lifted_projects`` port.

	:param project: mapping providing at least the ``"id"``, ``"db"``,
		``"temp_path"`` and ``"assembly"`` keys.
	:raises Exception: if liftOver exits with a non-zero status.
	"""
	log = task.logger
	conf = task.conf

	lifted_project_port = task.ports("lifted_projects")

	log.info("--- [{0}] --------------------------------------------".format(project["id"]))

	log.info("Preparing liftOver files ...")

	in_path = make_temp_file(task, suffix=".bed")
	out_path = make_temp_file(task, suffix=".bed")
	unmapped_path = os.path.join(project["temp_path"], "liftover_unmapped.bed")

	projdb = ProjectDb(project["db"])
	try:
		with open(in_path, "w") as in_file:
			for var in projdb.variants(order_by="position"):
				# The variant id travels in the BED name column so that the
				# lifted coordinates can be matched back to the database.
				in_file.write(tsv.line_text("chr" + var.chr, var.start, var.start + len(var.ref), var.id))

		log.info("Running liftOver ...")

		# NOTE: the command is executed through the shell, so the configured
		# binary may carry extra arguments, but paths containing spaces or
		# shell metacharacters are not supported.
		cmd = " ".join([
			conf["liftover_bin"],
			in_path,
			os.path.join(conf["liftover_chain_path"], "hg18ToHg19.over.chain"),
			out_path,
			unmapped_path
		])

		log.debug(cmd)

		retcode = subprocess.call(cmd, shell=True)
		if retcode != 0:
			# Do not update the database from missing or partial output files
			raise Exception("liftOver failed with exit code {0}".format(retcode))

		log.info("Annotating unmapped variants ...")

		count = 0
		with open(unmapped_path, "r") as f:
			for line in f:
				# Skip blank lines and the comment lines of the unmapped file
				if not line.strip() or line.lstrip().startswith("#"):
					continue
				fields = line.rstrip().split("\t")
				var_id = int(fields[3])
				projdb.update_variant_start(var_id, start=None)
				count += 1

		log.info("  {0} unmapped variants annotated".format(count))

		log.info("Updating variants ...")

		count = 0
		with open(out_path, "r") as f:
			for line in f:
				if not line.strip():
					continue
				_chrom, start, _end, var_id = line.rstrip().split("\t")
				# Same types as used for the unmapped variants and as written
				# to the input file: integer id and integer position.
				projdb.update_variant_start(int(var_id), start=int(start))
				count += 1

		log.info("  {0} variants".format(count))

		remove_temp(task, in_path, out_path)

		projdb.commit()
	finally:
		# Release the database handle even when liftOver or the update fails
		projdb.close()

	# Relabel the project only after the coordinates were actually converted,
	# so a failed run never leaves it marked as hg19.
	project["from_assembly"] = project["assembly"]
	project["assembly"] = "hg19"

	lifted_project_port.send(project)