def projects(project):
    """Compute recurrences for one project and forward it downstream.

    Opens the project database referenced by ``project["db"]``, asks it for
    the total number of affected samples and, when there is at least one,
    computes recurrences for variant genes. Unless the ``variants_only``
    configuration flag is set, gene and pathway recurrences are computed too.
    The changes are committed and the project is sent through the
    ``projects_out`` port.

    When there are no samples a warning is logged and the project is NOT
    forwarded.

    The database handle is always closed, including when a computation raises.
    """
    log = task.logger
    conf = task.conf

    projects_out_port = task.ports("projects_out")

    log.info("--- [{0}] --------------------------------------------".format(project["id"]))

    projdb = ProjectDb(project["db"])
    try:
        total_samples = projdb.get_total_affected_samples()

        if total_samples == 0:
            log.warn("There are no samples, recurrences cannot be calculated.")
            # Nothing to compute; the finally clause closes the database.
            return

        log.info("Calculating project recurrences for variant genes ...")
        projdb.compute_affected_genes_recurrences(total_samples)

        if not conf.get("variants_only", False):
            log.info("Calculating project recurrences for genes ...")
            projdb.compute_gene_recurrences(total_samples)

            log.info("Calculating project recurrences for pathways ...")
            projdb.compute_pathway_recurrences(total_samples)

        projdb.commit()
    finally:
        # Release the handle on every path: success, early return or error.
        projdb.close()

    projects_out_port.send(project)
def update_db(project):
    """Load OncodriveCLUST output for one project into its database.

    The ``"oncodriveclust"`` entry is removed from ``project``. If the results
    file it points to does not exist, a warning is logged and the function
    returns without forwarding the project. Otherwise:

    1. every (gene, cause) row of ``oncodriveclust-excluded-cause.tsv`` in the
       project's temp path is stored as the gene's exclusion cause;
    2. every row of the results file updates the gene's cluster coordinates,
       z-score, p-value and q-value, and marks it as not excluded.

    The changes are committed and the project is sent through the
    ``projects_out`` port. The database handle is closed even if reading a
    file or updating a gene raises.
    """
    log = task.logger

    projects_out_port = task.ports("projects_out")

    project_id = project["id"]

    log.info("--- [{0}] --------------------------------------------".format(project_id))

    # The oncodriveclust entry is dropped from the project once read, so it
    # is not part of what gets forwarded downstream.
    oclust = project["oncodriveclust"]
    del project["oncodriveclust"]

    if not os.path.exists(oclust["results"]):
        log.warn("No results have been found. Skipping it.")
        return

    log.info("Updating the project database ...")

    exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")

    result_types = (str, str, float, float, float)
    result_columns = ("GENE", "CLUST_COORDS", "ZSCORE", "PVALUE", "QVALUE")

    projdb = ProjectDb(project["db"])
    try:
        log.info(" Excluded gene causes ...")
        log.debug(" > {0}".format(exc_path))

        count = 0
        with tsv.open(exc_path, "r") as exf:
            for gene, cause in tsv.lines(exf, (str, str), header=True):
                projdb.update_gene(Gene(id=gene, clust_exc_cause=cause))
                count += 1

        log.debug(" {0} genes excluded".format(count))

        log.info(" OncodriveCLUST results ...")

        with tsv.open(oclust["results"], "r") as f:
            rows = tsv.lines(f, result_types, columns=result_columns,
                             header=True, null_value="NA")
            for gene, coords, zscore, pvalue, qvalue in rows:
                projdb.update_gene(Gene(
                    id=gene,
                    clust_coords=coords,
                    clust_zscore=zscore,
                    clust_pvalue=pvalue,
                    clust_qvalue=qvalue,
                    clust_exc_cause=ProjectDb.NO_GENE_EXC))

        projdb.commit()
    finally:
        # Close on every path so a malformed TSV does not leak the handle.
        projdb.close()

    projects_out_port.send(project)
def scan_files(project):
    """Build the variants database of a project from its input files.

    A fresh database is created in a temporary file, loaded with genes and
    pathways, and filled with the variants parsed from every entry of
    ``project["files"]`` (each entry is expanded through ``archived_files``).
    On success the temporary database is copied to ``project["db"]``,
    replacing any previous one, and the project is sent to the
    ``liftover_projects`` port for ``hg18`` assemblies or to ``projects_out``
    for ``hg19``.

    Raises:
        Exception: if the assembly is neither ``hg18`` nor ``hg19``, if an
            input file is missing or is not a regular file, or if a source
            yields no variants.
        InternalError: if an input path is not absolute.

    The temporary database handle is closed even when parsing fails.
    """
    log = task.logger
    conf = task.conf

    projects_port, liftover_projects_port = task.ports("projects_out", "liftover_projects")

    project_id = project["id"]
    projdb_path = project["db"]
    assembly = project["assembly"]

    log.info("--- [{0}] --------------------------------------------".format(project_id))

    if assembly == "hg18":
        out_port = liftover_projects_port
    elif assembly == "hg19":
        out_port = projects_port
    else:
        raise Exception("Unexpected assembly: {0}".format(assembly))

    # An existing database is never reused: it is removed and rebuilt.
    if os.path.exists(projdb_path):
        os.remove(projdb_path)

    log.info("Creating variants database ...")

    projdb_tmp_path = make_temp_file(task, suffix=".db")

    log.debug(projdb_tmp_path)

    projdb = ProjectDb(projdb_tmp_path)
    try:
        projdb.create()

        log.info("Loading genes ...")

        projdb.load_genes(get_data_ensembl_genes_path(conf))

        log.info("Loading pathways ...")

        projdb.load_pathways(
            get_data_kegg_def_path(conf),
            get_data_kegg_ensg_map_path(conf))

        log.info("Parsing variants ...")

        for file_path in project["files"]:
            if not os.path.isabs(file_path):
                raise InternalError("Non absolute path found: {0}".format(file_path))

            if not os.path.exists(file_path):
                raise Exception("Input file not found: {0}".format(file_path))

            if not os.path.isfile(file_path):
                raise Exception("Not a file: {0}".format(file_path))

            for container_name, path, name, ext, f in archived_files(file_path):
                fname = os.path.join(path, name + ext)
                if container_name is not None:
                    source_name = "{0}:{1}".format(os.path.basename(container_name), fname)
                else:
                    source_name = name + ext

                log.info("=> {0} ...".format(source_name))

                sample_id = os.path.basename(name)

                # Unrecognised extensions fall back to the "tab" parser.
                if ext.lower() in _SUPPORTED_EXTENSIONS:
                    parser_type = ext[1:]
                else:
                    parser_type = "tab"

                parser = create_variants_parser(parser_type, f, source_name, sample_id)

                source_id = projdb.add_source(source_name)

                # A set is used so that the count is of distinct variant ids.
                var_ids = set()
                for var in parser:
                    # NOTE(review): read_lines() presumably yields the source
                    # lines consumed since the previous call - confirm.
                    for line_num, text in parser.read_lines():
                        projdb.add_source_line(source_id, line_num, text)

                    var_id = projdb.add_variant(
                        var, source_id=source_id, line_num=parser.get_line_num())
                    var_ids.add(var_id)

                # Store whatever lines remain after the last variant.
                for line_num, text in parser.read_lines():
                    projdb.add_source_line(source_id, line_num, text)

                num_variants = len(var_ids)
                log.info(" {0} variants".format(num_variants))

                if num_variants == 0:
                    raise Exception(
                        "No variants found in source '{}'. "
                        "Please check the documentation for the expected input for '{}' format.".format(
                            source_name, parser.name))

        projdb.commit()
    finally:
        # Close on every path; the copy below needs a closed database and a
        # failed parse must not leak the handle.
        projdb.close()

    log.info("Copying variants database ...")

    log.debug("{0} -> {1}".format(projdb_tmp_path, projdb_path))

    shutil.copy(projdb_tmp_path, projdb_path)

    remove_temp(task, projdb_tmp_path)

    out_port.send(project)
def end():
    """Store the OncodriveFM results of every project into its database.

    ``task.context`` maps each project id to a list of project dicts. For
    every dict in a list the database at ``project["db"]`` is opened and:

    * for the first dict only, the per-sample gene functional impact table is
      cleared and reloaded from ``project["sample_gene_fi_data"]``;
    * the ``"oncodrivefm"`` entry is removed from the dict and used as the
      list of ``(feature, results_path)`` pairs to load;
    * excluded gene causes are read from ``oncodrivefm-excluded-cause.tsv``
      in the project's temp path;
    * ``"genes"`` and ``"pathways"`` result files update the corresponding
      records; any other feature name is only logged.

    After each list is processed, its first project dict is sent through the
    ``projects_out`` port. Each database handle is closed even when loading
    fails.
    """
    log = task.logger

    projects_out_port = task.ports("projects_out")

    fimpact_types = (int, str, float, float, int, float, float, int, float, float, int)
    gene_types = (str, float, float)
    pathway_types = (str, float, float, float)

    log.info("Updating the projects database ...")

    for project_id, projects in task.context.items():

        log.info("[{0}]".format(project_id))

        for index, project in enumerate(projects):
            projdb = ProjectDb(project["db"])
            try:
                if index == 0:
                    log.info(" Functional impact ...")

                    projdb.delete_sample_gene_fimpact()

                    with tsv.open(project["sample_gene_fi_data"], "r") as f:
                        for fields in tsv.lines(f, fimpact_types, header=True, null_value="-"):
                            projdb.add_sample_gene_fimpact(*fields)

                # The oncodrivefm entry is dropped from the project once read.
                ofm = project["oncodrivefm"]
                del project["oncodrivefm"]

                exc_path = os.path.join(project["temp_path"], "oncodrivefm-excluded-cause.tsv")

                log.info(" Excluded gene causes ...")
                log.debug(" > {0}".format(exc_path))

                count = 0
                with tsv.open(exc_path, "r") as exf:
                    for gene, cause in tsv.lines(exf, (str, str), header=True):
                        projdb.update_gene(Gene(id=gene, fm_exc_cause=cause))
                        count += 1

                log.debug(" {0} genes excluded".format(count))

                for feature, results_path in ofm:

                    log.info(" {0} ...".format(feature))
                    log.debug(" > {0}".format(results_path))

                    if feature == "genes":
                        count = 0
                        with tsv.open(results_path, "r") as f:
                            for gene, pvalue, qvalue in tsv.lines(f, gene_types, header=True):
                                projdb.update_gene(Gene(
                                    id=gene,
                                    fm_pvalue=pvalue,
                                    fm_qvalue=qvalue,
                                    fm_exc_cause=ProjectDb.NO_GENE_EXC))
                                count += 1
                        log.info(" {0} genes".format(count))
                    elif feature == "pathways":
                        count = 0
                        with tsv.open(results_path, "r") as f:
                            for pathway, zscore, pvalue, qvalue in tsv.lines(f, pathway_types, header=True):
                                projdb.update_pathway(Pathway(
                                    id=pathway,
                                    fm_zscore=zscore,
                                    fm_pvalue=pvalue,
                                    fm_qvalue=qvalue))
                                count += 1
                        log.info(" {0} pathways".format(count))

                projdb.commit()
            finally:
                # Close on every path so a malformed results file does not
                # leak one handle per project.
                projdb.close()

        projects_out_port.send(projects[0])
def liftover(project):
    """Convert the variant positions of a project from hg18 to hg19.

    Every variant in the project database is written to a temporary BED file
    (``chr<chr>``, start, start + len(ref), variant id). The external liftOver
    tool is run on it with the ``hg18ToHg19.over.chain`` chain file. Then:

    * variants listed in the unmapped output get their start set to ``None``;
    * variants listed in the mapped output get their start updated.

    ``project["from_assembly"]`` is set to the previous assembly and
    ``project["assembly"]`` to ``"hg19"``. On success the temporary files are
    removed, the database is committed and the project is sent through the
    ``lifted_projects`` port.

    Raises:
        Exception: if liftOver exits with a non-zero status. Without this
            check a failed run would leave the variants on hg18 coordinates
            while the project is labelled hg19.

    The database handle is closed even when a step fails.
    """
    log = task.logger
    conf = task.conf

    lifted_project_port = task.ports("lifted_projects")

    log.info("--- [{0}] --------------------------------------------".format(project["id"]))

    log.info("Preparing liftOver files ...")

    in_path = make_temp_file(task, suffix=".bed")
    out_path = make_temp_file(task, suffix=".bed")
    unmapped_path = os.path.join(project["temp_path"], "liftover_unmapped.bed")

    projdb = ProjectDb(project["db"])
    try:
        # The variant id travels as the 4th BED column so that the liftOver
        # output can be matched back to database rows.
        with open(in_path, "w") as in_file:
            for var in projdb.variants(order_by="position"):
                in_file.write(tsv.line_text(
                    "chr" + var.chr, var.start, var.start + len(var.ref), var.id))

        log.info("Running liftOver ...")

        project["from_assembly"] = project["assembly"]
        project["assembly"] = "hg19"

        # NOTE(review): the command goes through the shell as one unquoted
        # string, so paths containing spaces or shell metacharacters will
        # break it. Kept as is because "liftover_bin" may rely on shell
        # expansion - confirm before switching to an argument list.
        cmd = " ".join([
            conf["liftover_bin"],
            in_path,
            os.path.join(conf["liftover_chain_path"], "hg18ToHg19.over.chain"),
            out_path,
            unmapped_path])

        log.debug(cmd)

        retcode = subprocess.call(cmd, shell=True)
        if retcode != 0:
            raise Exception("liftOver failed with exit code {0}: {1}".format(retcode, cmd))

        log.info("Annotating unmapped variants ...")

        count = 0
        with open(unmapped_path, "r") as f:
            for line in f:
                # Skip blank lines and the "#..." lines of the unmapped file.
                if not line.strip() or line.lstrip().startswith("#"):
                    continue
                fields = line.rstrip().split("\t")
                var_id = int(fields[3])
                projdb.update_variant_start(var_id, start=None)
                count += 1

        log.info(" {0} unmapped variants annotated".format(count))

        log.info("Updating variants ...")

        count = 0
        with open(out_path, "r") as f:
            for line in f:
                if not line.strip():
                    continue
                _chrom, start, _end, var_id = line.rstrip().split("\t")
                # The id is converted to int to match the unmapped loop above.
                projdb.update_variant_start(int(var_id), start=start)
                count += 1

        log.info(" {0} variants".format(count))

        remove_temp(task, in_path, out_path)

        projdb.commit()
    finally:
        # Close on every path so a liftOver failure does not leak the handle.
        projdb.close()

    lifted_project_port.send(project)